Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve performance #12

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Cuckoo Filter

[![GitHub go.mod Go version of a Go module](https://img.shields.io/github/go-mod/go-version/panmari/cuckoofilter.svg)](https://github.com/panmari/cuckoofilter)
[![GoDoc](https://godoc.org/github.com/panmari/cuckoofilter?status.svg)](https://godoc.org/github.com/panmari/cuckoofilter)
[![GoReportCard](https://goreportcard.com/badge/github.com/panmari/cuckoofilter)](https://goreportcard.com/report/github.com/panmari/cuckoofilter)
[![GitHub go.mod Go version of a Go module](https://img.shields.io/github/go-mod/go-version/livekit/cuckoofilter.svg)](https://github.com/livekit/cuckoofilter)
[![GoDoc](https://godoc.org/github.com/livekit/cuckoofilter?status.svg)](https://godoc.org/github.com/livekit/cuckoofilter)
[![GoReportCard](https://goreportcard.com/badge/github.com/livekit/cuckoofilter)](https://goreportcard.com/report/github.com/livekit/cuckoofilter)

Well-tuned, production-ready cuckoo filter that performs best in class for low false positive rates (at around 0.01%). For details, see [full evaluation](https://panmari.github.io/2020/10/09/probabilistic-filter-golang.html).

Expand Down Expand Up @@ -35,7 +35,7 @@ With the 16 bit fingerprint size in this repository, you can expect `r ~= 0.0001
import (
"fmt"

cuckoo "github.com/panmari/cuckoofilter"
cuckoo "github.com/livekit/cuckoofilter"
)

func Example() {
Expand All @@ -57,6 +57,6 @@ func Example() {
}
```

For more examples, see [the example tests](https://github.com/panmari/cuckoofilter/blob/master/example_test.go).
Operations on a filter are not thread safe by default.
For more examples, see [the example tests](https://github.com/livekit/cuckoofilter/blob/master/example_test.go).
Operations on a filter are not thread safe by default.
See [this example](example_threadsafe_test.go) for using the filter concurrently.
45 changes: 22 additions & 23 deletions bucket.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,16 @@ package cuckoo
import (
"bytes"
"fmt"
"math/bits"
)

// fingerprint represents a single entry in a bucket.
type fingerprint uint16

// bucket keeps track of fingerprints hashing to the same index.
type bucket [bucketSize]fingerprint
type bucket uint64

const (
nullFp = 0
bucketSize = 4
fingerprintSizeBits = 16
maxFingerprint = (1 << fingerprintSizeBits) - 1
Expand All @@ -21,48 +21,47 @@ const (
// insert a fingerprint into a bucket. Returns true if there was enough space and insertion succeeded.
// Note it allows inserting the same fingerprint multiple times.
func (b *bucket) insert(fp fingerprint) bool {
for i, tfp := range b {
if tfp == nullFp {
b[i] = fp
return true
}
if i := findZeros(uint64(*b)); i != 0 {
*b |= bucket(fp) << (bits.Len64(i) - fingerprintSizeBits)
return true
}
return false
}

// delete a fingerprint from a bucket.
// Returns true if the fingerprint was present and successfully removed.
func (b *bucket) delete(fp fingerprint) bool {
for i, tfp := range b {
if tfp == fp {
b[i] = nullFp
return true
}
if i := findValue(uint64(*b), uint16(fp)); i != 0 {
*b &= ^(maxFingerprint << (bits.Len64(i) - fingerprintSizeBits))
return true
}
return false
}

// swap stores fp in slot i of the bucket and returns the fingerprint that
// occupied that slot before the call. Slot i lives in the
// fingerprintSizeBits-wide field starting at bit i*fingerprintSizeBits of
// the packed bucket word.
func (b *bucket) swap(i uint64, fp fingerprint) fingerprint {
	shift := i * fingerprintSizeBits
	slotMask := bucket(maxFingerprint) << shift
	evicted := (*b & slotMask) >> shift
	// Clear the slot, then drop the new fingerprint into place.
	*b = (*b &^ slotMask) | bucket(fp)<<shift
	return fingerprint(evicted)
}

func (b *bucket) contains(needle fingerprint) bool {
for _, fp := range b {
if fp == needle {
return true
}
}
return false
return findValue(uint64(*b), uint16(needle)) != 0
}

// nullsCount returns the number of bits set in the mask that findZeros
// produces for the packed bucket word. Decode subtracts this value from
// bucketSize to count the occupied slots of a bucket.
func (b *bucket) nullsCount() uint {
	emptyMask := findZeros(uint64(*b))
	return uint(bits.OnesCount64(emptyMask))
}

// reset deletes all fingerprints in the bucket.
func (b *bucket) reset() {
for i := range b {
b[i] = nullFp
}
*b = 0
}

func (b *bucket) String() string {
var buf bytes.Buffer
buf.WriteString("[")
for _, by := range b {
buf.WriteString(fmt.Sprintf("%5d ", by))
for i := 3; i >= 0; i-- {
buf.WriteString(fmt.Sprintf("%5d ", ((*b)>>(i*fingerprintSizeBits))&maxFingerprint))
}
buf.WriteString("]")
return buf.String()
Expand Down
55 changes: 54 additions & 1 deletion bucket_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,65 @@ import (
func TestBucket_Reset(t *testing.T) {
var bkt bucket
for i := fingerprint(0); i < bucketSize; i++ {
bkt[i] = i
bkt.insert(i + 1)
}

bkt.reset()

var want bucket
if !reflect.DeepEqual(bkt, want) {
t.Errorf("bucket.reset() got %v, want %v", bkt, want)
}
}

// TestBucket_Insert fills every slot of an empty bucket and then checks that
// one further insertion is rejected.
func TestBucket_Insert(t *testing.T) {
	var b bucket
	for fp := fingerprint(1); fp <= bucketSize; fp++ {
		if ok := b.insert(fp); !ok {
			t.Error("bucket insert failed")
		}
	}
	if overflowed := b.insert(5); overflowed {
		t.Error("expected bucket insert to fail after overflow")
	}
}

// TestBucket_Delete fills a bucket and then, for each stored fingerprint,
// checks that it can be deleted and that the bucket accepts it again.
func TestBucket_Delete(t *testing.T) {
	var b bucket
	for fp := fingerprint(1); fp <= bucketSize; fp++ {
		b.insert(fp)
	}

	for fp := fingerprint(1); fp <= bucketSize; fp++ {
		removed := b.delete(fp)
		if !removed {
			t.Error("bucket delete failed")
		}
		reinserted := b.insert(fp)
		if !reinserted {
			t.Error("bucket insert after delete failed")
		}
	}
}

func TestBucket_Swap(t *testing.T) {
var bkt bucket
bkt.insert(123)
if prev := bkt.swap(3, 321); prev != 123 {
t.Errorf("swap returned unexpected value %d", prev)
}
if !bkt.contains(321) {
t.Errorf("contains after swap failed")
}
}

// TestBucket_Contains fills a bucket with distinct fingerprints and checks
// that each of them is reported as present.
func TestBucket_Contains(t *testing.T) {
	var b bucket
	for fp := fingerprint(1); fp <= bucketSize; fp++ {
		b.insert(fp)
	}

	for fp := fingerprint(1); fp <= bucketSize; fp++ {
		if found := b.contains(fp); !found {
			t.Error("bucket contains failed")
		}
	}
}
62 changes: 32 additions & 30 deletions cuckoofilter.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
package cuckoo

import (
"bytes"
"encoding/binary"
"fmt"
"math/rand"

"github.com/zeebo/wyhash"
)

// maxCuckooKickouts is the maximum number of times reinsert
Expand All @@ -18,25 +18,31 @@ type Filter struct {
// Bit mask set to len(buckets) - 1. As len(buckets) is always a power of 2,
// applying this mask mimics the operation x % len(buckets).
bucketIndexMask uint
rng wyhash.RNG
}

// NewFilter returns a new cuckoofilter suitable for the given number of elements.
// When inserting more elements, insertion speed will drop significantly and insertions might fail altogether.
// A capacity of 1000000 is a normal default, which allocates
// about 2MB on 64-bit machines.
func NewFilter(numElements uint) *Filter {
numBuckets := getNextPow2(uint64(numElements / bucketSize))
if float64(numElements)/float64(numBuckets*bucketSize) > 0.96 {
numBuckets <<= 1
paddedNumElements := getNextPow2(uint64(numElements/bucketSize)) * bucketSize
if float64(numElements)/float64(paddedNumElements) > 0.96 {
paddedNumElements <<= 1
}
return NewFilterWithoutPadding(paddedNumElements)
}

func NewFilterWithoutPadding(numElements uint) *Filter {
numBuckets := getNextPow2(uint64(numElements / bucketSize))
if numBuckets == 0 {
numBuckets = 1
}
buckets := make([]bucket, numBuckets)
return &Filter{
buckets: buckets,
count: 0,
bucketIndexMask: uint(len(buckets) - 1),
bucketIndexMask: numBuckets - 1,
}
}

Expand Down Expand Up @@ -72,7 +78,11 @@ func (cf *Filter) Insert(data []byte) bool {
if cf.insert(fp, i2) {
return true
}
return cf.reinsert(fp, randi(i1, i2))
if cf.rng.Uint64()&1 == 0 {
return cf.reinsert(fp, i1)
} else {
return cf.reinsert(fp, i2)
}
}

func (cf *Filter) insert(fp fingerprint, i uint) bool {
Expand All @@ -85,9 +95,9 @@ func (cf *Filter) insert(fp fingerprint, i uint) bool {

func (cf *Filter) reinsert(fp fingerprint, i uint) bool {
for k := 0; k < maxCuckooKickouts; k++ {
j := rand.Intn(bucketSize)
j := cf.rng.Uint64() & (bucketSize - 1)
// Swap fingerprint with bucket entry.
cf.buckets[i][j], fp = fp, cf.buckets[i][j]
fp = cf.buckets[i].swap(j, fp)

// Move kicked out fingerprint to alternate location.
i = getAltIndex(fp, i, cf.bucketIndexMask)
Expand Down Expand Up @@ -127,44 +137,36 @@ const bytesPerBucket = bucketSize * fingerprintSizeBits / 8

// Encode returns a byte slice representing a Cuckoofilter.
func (cf *Filter) Encode() []byte {
res := new(bytes.Buffer)
res.Grow(len(cf.buckets) * bytesPerBucket)

buf := make([]byte, 0, len(cf.buckets)*bytesPerBucket+8)
for _, b := range cf.buckets {
for _, fp := range b {
binary.Write(res, binary.LittleEndian, fp)
}
buf = binary.LittleEndian.AppendUint64(buf, uint64(b))
}
return res.Bytes()
buf = binary.LittleEndian.AppendUint64(buf, uint64(cf.rng))
return buf
}

// Decode returns a Cuckoofilter from a byte slice created using Encode.
func Decode(data []byte) (*Filter, error) {
if len(data)%bucketSize != 0 {
return nil, fmt.Errorf("bytes must to be multiple of %d, got %d", bucketSize, len(data))
}
numBuckets := len(data) / bytesPerBucket
numBuckets := (len(data) - 8) / bytesPerBucket
if numBuckets < 1 {
return nil, fmt.Errorf("bytes can not be smaller than %d, size in bytes is %d", bytesPerBucket, len(data))
}
if getNextPow2(uint64(numBuckets)) != uint(numBuckets) {
return nil, fmt.Errorf("numBuckets must to be a power of 2, got %d", numBuckets)
}
var count uint

var count, pos uint
buckets := make([]bucket, numBuckets)
reader := bytes.NewReader(data)

for i, b := range buckets {
for j := range b {
binary.Read(reader, binary.LittleEndian, &buckets[i][j])
if buckets[i][j] != nullFp {
count++
}
}
for i := range buckets {
buckets[i] = bucket(binary.LittleEndian.Uint64(data[pos:]))
pos += 8
count += bucketSize - buckets[i].nullsCount()
}
rng := binary.LittleEndian.Uint64(data[pos:])
return &Filter{
buckets: buckets,
count: count,
bucketIndexMask: uint(len(buckets) - 1),
rng: wyhash.RNG(rng),
}, nil
}
12 changes: 3 additions & 9 deletions cuckoofilter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,15 +239,9 @@ func TestDeleteMultipleSame(t *testing.T) {

func TestEncodeDecode(t *testing.T) {
cf := NewFilter(10)
cf.Insert([]byte{1})
cf.Insert([]byte{2})
cf.Insert([]byte{3})
cf.Insert([]byte{4})
cf.Insert([]byte{5})
cf.Insert([]byte{6})
cf.Insert([]byte{7})
cf.Insert([]byte{8})
cf.Insert([]byte{9})
for i := 0; i < 16; i++ {
cf.Insert([]byte{byte(i)})
}
encoded := cf.Encode()
got, err := Decode(encoded)
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package cuckoo_test
import (
"fmt"

cuckoo "github.com/panmari/cuckoofilter"
cuckoo "github.com/livekit/cuckoofilter"
)

func Example() {
Expand Down
2 changes: 1 addition & 1 deletion example_threadsafe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import (
"fmt"
"sync"

cuckoo "github.com/panmari/cuckoofilter"
cuckoo "github.com/livekit/cuckoofilter"
)

// Small wrapper around cuckoo filter making it thread safe.
Expand Down
7 changes: 5 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
module github.com/panmari/cuckoofilter
module github.com/livekit/cuckoofilter

go 1.15

require (
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165
github.com/google/go-cmp v0.5.9
github.com/zeebo/wyhash v0.0.1
github.com/zeebo/xxh3 v1.0.2
)

require github.com/klauspost/cpuid/v2 v2.0.9 // indirect
9 changes: 7 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165 h1:BS21ZUJ/B5X2UVUbczfmdWH7GapPWAhxcMsDnjJTU1E=
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ=
github.com/zeebo/wyhash v0.0.1 h1:VEByEMek3iHhV65CgG3SRAWVtg/6TcmbEKj5jPOKDrc=
github.com/zeebo/wyhash v0.0.1/go.mod h1:Ti+OwfNtM5AZiYAL0kOPIfliqDP5c0VtOnnMAqzuuZk=
github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0=
github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA=
Loading