From 80516ed3351477793eec882508969b6b29b69b0a Mon Sep 17 00:00:00 2001
From: Jakob Unterwurzacher
Date: Fri, 9 Jun 2017 21:52:26 +0200
Subject: [PATCH] cryptocore: prefetch nonces in 512-byte blocks

On my machine, reading 512-byte blocks from /dev/urandom (same via
getentropy syscall) is a lot faster in terms of throughput:

Blocksize  Throughput
16         28.18 MB/s
512        83.75 MB/s

For a single-threaded streaming write, this drops the CPU usage of
nonceGenerator.Get to almost 1/3:

        flat   flat%   sum%    cum    cum%
Before  0      0%      95.08%  0.35s  2.92%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get
After   0.01s  0.092%  92.34%  0.13s  1.20%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get

This change makes the nonce reading single-threaded, which may hurt
massively-parallel writes.
---
 internal/cryptocore/nonce.go             |  3 +-
 internal/cryptocore/randprefetch.go      | 50 ++++++++++++++++++++++++
 internal/cryptocore/randprefetch_test.go | 40 +++++++++++++++++++
 3 files changed, 91 insertions(+), 2 deletions(-)
 create mode 100644 internal/cryptocore/randprefetch.go
 create mode 100644 internal/cryptocore/randprefetch_test.go

diff --git a/internal/cryptocore/nonce.go b/internal/cryptocore/nonce.go
index 412cdbbe..9df094cb 100644
--- a/internal/cryptocore/nonce.go
+++ b/internal/cryptocore/nonce.go
@@ -28,6 +28,5 @@ type nonceGenerator struct {
 
 // Get a random "nonceLen"-byte nonce
 func (n *nonceGenerator) Get() []byte {
-	nonce := RandBytes(n.nonceLen)
-	return nonce
+	return randPrefetcher.read(n.nonceLen)
 }
diff --git a/internal/cryptocore/randprefetch.go b/internal/cryptocore/randprefetch.go
new file mode 100644
index 00000000..8825a053
--- /dev/null
+++ b/internal/cryptocore/randprefetch.go
@@ -0,0 +1,50 @@
+package cryptocore
+
+import (
+	"bytes"
+	"log"
+	"sync"
+)
+
+/*
+Number of bytes to prefetch.
+
+512 looks like a good compromise between throughput and latency:
+Benchmark16-2       3000000    567 ns/op    28.18 MB/s
+Benchmark64-2       5000000    293 ns/op    54.51 MB/s
+Benchmark128-2     10000000    220 ns/op    72.48 MB/s
+Benchmark256-2     10000000    210 ns/op    76.17 MB/s
+Benchmark512-2     10000000    191 ns/op    83.75 MB/s
+Benchmark1024-2    10000000    171 ns/op    93.48 MB/s
+Benchmark2048-2    10000000    165 ns/op    96.45 MB/s
+Benchmark4096-2    10000000    165 ns/op    96.58 MB/s
+Benchmark40960-2   10000000    147 ns/op   108.82 MB/s
+*/
+const prefetchN = 512
+
+type randPrefetcherT struct {
+	sync.Mutex
+	buf bytes.Buffer
+}
+
+func (r *randPrefetcherT) read(want int) (out []byte) {
+	out = make([]byte, want)
+	r.Lock()
+	// Note: don't use defer, it slows us down!
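+	// We therefore call Unlock() manually on both return paths.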
+	have, err := r.buf.Read(out)
+	if have == want && err == nil {
+		r.Unlock()
+		return out
+	}
+	// Buffer was empty -> re-fill
+	r.buf.Reset()
+	r.buf.Write(RandBytes(prefetchN))
+	have, err = r.buf.Read(out)
+	if have != want || err != nil {
+		log.Panicf("randPrefetcher could not satisfy read: have=%d want=%d err=%v", have, want, err)
+	}
+	r.Unlock()
+	return out
+}
+
+var randPrefetcher randPrefetcherT
diff --git a/internal/cryptocore/randprefetch_test.go b/internal/cryptocore/randprefetch_test.go
new file mode 100644
index 00000000..2a568f31
--- /dev/null
+++ b/internal/cryptocore/randprefetch_test.go
@@ -0,0 +1,40 @@
+package cryptocore
+
+import (
+	"bytes"
+	"compress/flate"
+	"runtime"
+	"sync"
+	"testing"
+)
+
+// TestRandPrefetch hammers the randPrefetcher with 100 goroutines and verifies
+// that the result is incompressible
+func TestRandPrefetch(t *testing.T) {
+	runtime.GOMAXPROCS(10)
+	p := 100
+	l := 200
+	vec := make([][]byte, p)
+	var wg sync.WaitGroup
+	for i := 0; i < p; i++ {
+		wg.Add(1)
+		go func(i int) {
+			var tmp []byte
+			for x := 0; x < l; x++ {
+				tmp = append(tmp, randPrefetcher.read(l)...)
+			}
+			vec[i] = tmp
+			wg.Done()
+		}(i)
+	}
+	wg.Wait()
+	var b bytes.Buffer
+	fw, _ := flate.NewWriter(&b, flate.BestCompression)
+	for _, v := range vec {
+		fw.Write(v)
+	}
+	fw.Close()
+	if b.Len() < p*l*l {
+		t.Errorf("random data should be incompressible, but: in=%d compressed=%d\n", p*l*l, b.Len())
+	}
+}
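
The Benchmark* numbers quoted in the randprefetch.go comment measure raw
RandBytes throughput at the given block sizes. A benchmark along these lines
reproduces them; this is a minimal sketch, and the helper name
benchmarkRandBytes plus the exact file layout are assumptions, not the
repository's actual benchmark code:

package cryptocore

import "testing"

// benchmarkRandBytes measures RandBytes throughput at a fixed block size.
// SetBytes makes "go test" report MB/s alongside ns/op.
// (Sketch only; the repository's real benchmark may be organized differently.)
func benchmarkRandBytes(b *testing.B, size int) {
	b.SetBytes(int64(size))
	for i := 0; i < b.N; i++ {
		RandBytes(size)
	}
}

// One benchmark per block size from the table in randprefetch.go.
func Benchmark16(b *testing.B)    { benchmarkRandBytes(b, 16) }
func Benchmark64(b *testing.B)    { benchmarkRandBytes(b, 64) }
func Benchmark128(b *testing.B)   { benchmarkRandBytes(b, 128) }
func Benchmark256(b *testing.B)   { benchmarkRandBytes(b, 256) }
func Benchmark512(b *testing.B)   { benchmarkRandBytes(b, 512) }
func Benchmark1024(b *testing.B)  { benchmarkRandBytes(b, 1024) }
func Benchmark2048(b *testing.B)  { benchmarkRandBytes(b, 2048) }
func Benchmark4096(b *testing.B)  { benchmarkRandBytes(b, 4096) }
func Benchmark40960(b *testing.B) { benchmarkRandBytes(b, 40960) }

Running "go test -bench ." prints results in the Benchmark16-2 form seen
above, where the -2 suffix is the GOMAXPROCS value at run time.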