cryptocore: prefetch nonces in 512-byte blocks
On my machine, reading from /dev/urandom (same when going through
the getentropy syscall) in 512-byte blocks instead of 16-byte blocks
gives much higher throughput:

Blocksize    Throughput
 16          28.18 MB/s
512          83.75 MB/s

For a single-threaded streaming write, this cuts the CPU time spent in
nonceGenerator.Get to roughly a third:

        flat  flat%   sum%        cum   cum%
Before     0     0% 95.08%      0.35s  2.92%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get
After  0.01s 0.092% 92.34%      0.13s  1.20%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get

This change makes the nonce reading single-threaded, which may
hurt massively-parallel writes.
rfjakob committed Jun 9, 2017
1 parent da1bd74 commit 80516ed
Showing 3 changed files with 91 additions and 2 deletions.
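
For context, throughput figures like the ones in the commit message can be reproduced with a small Go benchmark of this shape (a sketch, not part of this commit; the helper and benchmark names are hypothetical):

package cryptocore

import "testing"

// Hypothetical benchmark sketch: b.SetBytes makes "go test -bench" report
// throughput in MB/s for the given block size.
func benchmarkRandBytes(b *testing.B, size int) {
	b.SetBytes(int64(size))
	for i := 0; i < b.N; i++ {
		RandBytes(size)
	}
}

func Benchmark16(b *testing.B)  { benchmarkRandBytes(b, 16) }
func Benchmark512(b *testing.B) { benchmarkRandBytes(b, 512) }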
3 changes: 1 addition & 2 deletions internal/cryptocore/nonce.go
@@ -28,6 +28,5 @@ type nonceGenerator struct {
 
 // Get a random "nonceLen"-byte nonce
 func (n *nonceGenerator) Get() []byte {
-	nonce := RandBytes(n.nonceLen)
-	return nonce
+	return randPrefetcher.read(n.nonceLen)
 }
50 changes: 50 additions & 0 deletions internal/cryptocore/randprefetch.go
@@ -0,0 +1,50 @@
package cryptocore

import (
	"bytes"
	"log"
	"sync"
)

/*
Number of bytes to prefetch.
512 looks like a good compromise between throughput and latency:
Benchmark16-2      3000000    567 ns/op    28.18 MB/s
Benchmark64-2      5000000    293 ns/op    54.51 MB/s
Benchmark128-2    10000000    220 ns/op    72.48 MB/s
Benchmark256-2    10000000    210 ns/op    76.17 MB/s
Benchmark512-2    10000000    191 ns/op    83.75 MB/s
Benchmark1024-2   10000000    171 ns/op    93.48 MB/s
Benchmark2048-2   10000000    165 ns/op    96.45 MB/s
Benchmark4096-2   10000000    165 ns/op    96.58 MB/s
Benchmark40960-2  10000000    147 ns/op   108.82 MB/s
*/
const prefetchN = 512

type randPrefetcherT struct {
	sync.Mutex
	buf bytes.Buffer
}

func (r *randPrefetcherT) read(want int) (out []byte) {
	out = make([]byte, want)
	r.Lock()
	// Note: don't use defer, it slows us down!
	have, err := r.buf.Read(out)
	if have == want && err == nil {
		r.Unlock()
		return out
	}
	// Buffer was empty -> re-fill
	r.buf.Reset()
	r.buf.Write(RandBytes(prefetchN))
	have, err = r.buf.Read(out)
	if have != want || err != nil {
		log.Panicf("randPrefetcher could not satisfy read: have=%d want=%d err=%v", have, want, err)
	}
	r.Unlock()
	return out
}

var randPrefetcher randPrefetcherT
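
RandBytes is defined elsewhere in the cryptocore package and is not touched by this commit. For readers following read() above, a minimal sketch of the kind of helper it relies on, assuming it wraps crypto/rand.Read and panics on failure:

package cryptocore

import (
	"crypto/rand"
	"log"
)

// Sketch only (assumed implementation, not part of this diff): return n bytes
// of cryptographically strong randomness, panicking if the entropy source fails.
func RandBytes(n int) []byte {
	b := make([]byte, n)
	if _, err := rand.Read(b); err != nil {
		log.Panic("Failed to read random bytes: " + err.Error())
	}
	return b
}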
40 changes: 40 additions & 0 deletions internal/cryptocore/randprefetch_test.go
@@ -0,0 +1,40 @@
package cryptocore

import (
	"bytes"
	"compress/flate"
	"runtime"
	"sync"
	"testing"
)

// TestRandPrefetch hammers the randPrefetcher with 100 goroutines and verifies
// that the result is incompressible
func TestRandPrefetch(t *testing.T) {
	runtime.GOMAXPROCS(10)
	p := 100
	l := 200
	vec := make([][]byte, p)
	var wg sync.WaitGroup
	for i := 0; i < p; i++ {
		wg.Add(1)
		go func(i int) {
			var tmp []byte
			for x := 0; x < l; x++ {
				tmp = append(tmp, randPrefetcher.read(l)...)
			}
			vec[i] = tmp
			wg.Done()
		}(i)
	}
	wg.Wait()
	var b bytes.Buffer
	fw, _ := flate.NewWriter(&b, flate.BestCompression)
	for _, v := range vec {
		fw.Write(v)
	}
	fw.Close()
	if b.Len() < p*l*l {
		t.Errorf("random data should be incompressible, but: in=%d compressed=%d\n", p*l*l, b.Len())
	}
}
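
The new code can be exercised locally with standard Go tooling (nothing specific to this commit); the -race run is useful because the prefetcher is shared between goroutines:

go test -run TestRandPrefetch ./internal/cryptocore/
go test -race -run TestRandPrefetch ./internal/cryptocore/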
