jar: do not keep large buffers unnecessarily
This uses a sync.Pool wrapper (called pool.Dynamic) that prevents the
pool from holding on to very large buffers indefinitely, while still
amortizing the cost of allocation. The policy appears to give good
results both with the pre-existing tests and the specific tests added
for the pool.

This is useful because the library is also used from long-running server
contexts, where it would be unfortunate to pin very large buffers for
too long. See golang/go#23199.
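
For illustration only (not part of this commit), the intended Get/Put pattern looks roughly like the sketch below. `readInto` and the standalone `bufSize` constant are illustrative names; the pool API and import path are the ones added in this change:

```
package example

import (
	"io"
	"sync"

	"github.com/google/log4jscanner/pool"
)

// bufSize is assumed here for the sketch; the tests below use 4096.
const bufSize = 4096

// bufPool hands out byte slices and, unlike a plain sync.Pool, drops buffers
// that are much larger than recent usage instead of pinning them forever.
var bufPool = pool.Dynamic{
	Pool:       sync.Pool{New: func() interface{} { return make([]byte, 0) }},
	MinUtility: bufSize,
}

// readInto is a hypothetical helper that reads exactly size bytes into a
// pooled buffer.
func readInto(r io.Reader, size int) error {
	buf := bufPool.Get().([]byte)
	if cap(buf) < size {
		buf = make([]byte, size) // Pooled buffer too small; allocate a new one.
	} else {
		buf = buf[:size]
	}
	// Utility is the part of the buffer actually used, cost its full capacity;
	// Put rejects the buffer when cost is far above the average utility.
	defer bufPool.Put(buf, float64(len(buf)), float64(cap(buf)))
	_, err := io.ReadFull(r, buf)
	return err
}
```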

Example algorithm run (from the test). The columns are the iteration number, the cumulative allocation count, the requested size (value), the decayed average utility (target), and the resulting buffer capacity:

```
num  allocs   value      target            capacity
1    1        100000     100000.000000     100000
2    1        1          52048.000000      100000
3    1        1          28072.000000      100000
4    1        1          16084.000000      100000
5    1        1          10090.000000      100000
6    1        1          7093.000000       100000
7    2        10         5594.500000       4096
8    2        1          4845.250000       4096
9    2        1          4470.625000       4096
10   2        1          4283.312500       4096
11   2        1          4189.656250       4096
12   2        1          4142.828125       4096
13   2        1          4119.414062       4096
14   2        1          4107.707031       4096
15   2        12         4101.853516       4096
16   2        1          4098.926758       4096
17   2        1          4097.463379       4096
18   2        1          4096.731689       4096
19   2        1          4096.365845       4096
20   2        1          4096.182922       4096
21   2        1          4096.091461       4096
22   2        1          4096.045731       4096
23   2        1000       4096.022865       4096
24   2        100        4096.011433       4096
25   3        10000      10000.000000      10000
26   4        100000     100000.000000     100000
27   4        1          52048.000000      100000
28   4        100000     100000.000000     100000
29   4        1          52048.000000      100000
30   4        50000      51024.000000      100000
31   4        1          27560.000000      100000
32   4        1          15828.000000      100000
33   4        25000      25000.000000      100000
34   4        1          14548.000000      100000
35   4        1          9322.000000       100000
36   5        1          6709.000000       4096
37   6        100000     100000.000000     100000
38   6        1          52048.000000      100000
39   6        1          28072.000000      100000
40   6        1          16084.000000      100000
41   6        1          10090.000000      100000
42   6        1          7093.000000       100000
43   7        1          5594.500000       4096
44   7        1          4845.250000       4096
45   7        1          4470.625000       4096
46   7        1          4283.312500       4096
47   7        100        4189.656250       4096
48   7        100        4142.828125       4096
49   7        100        4119.414062       4096
50   7        1          4107.707031       4096
51   7        1          4101.853516       4096
52   7        1          4098.926758       4096
53   7        1          4097.463379       4096
54   7        1          4096.731689       4096
55   7        100        4096.365845       4096
56   7        200        4096.182922       4096
57   7        300        4096.091461       4096
58   7        100        4096.045731       4096
59   7        50         4096.022865       4096
60   7        50         4096.011433       4096
61   7        50         4096.005716       4096
62   7        50         4096.002858       4096
63   7        50         4096.001429       4096
64   7        1          4096.000715       4096
65   7        1          4096.000357       4096
66   7        1          4096.000179       4096
67   7        1          4096.000089       4096
68   8        100000000  100000000.000000  100000000
69   8        1000000    50500000.000000   100000000
70   8        100000     25300000.000000   100000000
71   8        10000      12655000.000000   100000000
72   8        1000       6329548.000000    100000000
73   9        100        3166822.000000    4096
74   9        10         1585459.000000    4096
75   9        1          794777.500000     4096
76   9        1          399436.750000     4096
77   9        500        201766.375000     4096
78   9        2020       102931.187500     4096
79   9        400        53513.593750      4096
80   9        3984       28804.796875      4096
81   9        5          16450.398438      4096
82   9        200        10273.199219      4096
83   9        500        7184.599609       4096
84   10       40000      40000.000000      40000
85   10       35000      37500.000000      40000
86   11       45000      45000.000000      45000
87   11       42000      43500.000000      45000
88   11       38000      40750.000000      45000
89   11       38000      39375.000000      45000
90   11       39000      39187.500000      45000
91   11       41000      41000.000000      45000
92   11       42000      42000.000000      45000
93   11       42000      42000.000000      45000
94   11       2000       23048.000000      45000
95   11       4000       13572.000000      45000
96   11       3949       8834.000000       45000
97   11       2011       6465.000000       45000
98   11       4096       5280.500000       45000
99   11       33         4688.250000       45000
100  11       0          4392.125000       45000
101  12       4938       4938.000000       4938
102  12       1          4517.000000       4938
103  12       1          4306.500000       4938
104  12       1200       4201.250000       4938
105  12       2400       4148.625000       4938
106  12       1200       4122.312500       4938
107  12       200        4109.156250       4938
108  12       400        4102.578125       4938
109  12       600        4099.289062       4938
110  12       700        4097.644531       4938
111  12       100        4096.822266       4938
112  12       400        4096.411133       4938
113  12       500        4096.205566       4938
114  12       700        4096.102783       4938
115  12       600        4096.051392       4938
116  12       900        4096.025696       4938
117  12       1000       4096.012848       4938
118  12       1100       4096.006424       4938
119  12       1200       4096.003212       4938
120  12       1000       4096.001606       4938
```
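
The target column is the decayed average utility: starting from zero, a larger requested size is taken as-is, while a smaller one (clamped to the 4096 minimum) is averaged 50/50 with the previous target. A small standalone sketch of that recurrence, reproducing rows 1-4 above (the decay logic restates pool/dynamic.go below; the driver loop is illustrative):

```
package main

import "fmt"

// decay mirrors the pool's policy: clamp val to min, jump up to a larger val
// immediately, and otherwise average 50/50 with the previous value.
func decay(prev, val, min float64) float64 {
	if val < min {
		val = min
	}
	if prev == 0 || val > prev {
		return val
	}
	return prev*0.5 + val*0.5
}

func main() {
	target := 0.0
	for _, size := range []float64{100000, 1, 1, 1} {
		target = decay(target, size, 4096)
		fmt.Println(target) // 100000, 52048, 28072, 16084 -- rows 1-4 above.
	}
}
```

By row 6 the target has decayed to 7093, so ten times the target (70930) is below the 100000-byte buffer's cost and Put rejects it; that is why row 7 shows a second allocation and a capacity of only 4096.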

Benchmarks confirm that the pool still retains buffers long enough to
amortize allocation, as performance is no worse than in the previous
commit:

```
$ git checkout main
TMPDIR="$HOME/tmp/tmpdir"
mkdir "$TMPDIR" || true
for file in jar/testdata/* ; do
  RTMPDIR="$TMPDIR/$(basename $file)"
  mkdir "$RTMPDIR" || true
  ln -fv "$PWD/$file" "$RTMPDIR"
done
for commit in $(git log --pretty=oneline | head -5 | awk '{print $1}' | tac) ; do
  git checkout $commit
  go build
  hyperfine --ignore-failure --warmup 1 "./log4jscanner $TMPDIR/400mb_jar_in_jar.jar"
  rm log4jscanner
done
HEAD is now at 48d70bf jar: add benchmarks with 400mb_jar_in_jar.jar
  Time (mean ± σ):      2.026 s ±  0.324 s    [User: 2.363 s, System: 1.269 s]
  Range (min … max):    1.651 s …  2.749 s    10 runs

HEAD is now at bf524fa jar: close the zip.File reader before recursing
  Time (mean ± σ):      1.908 s ±  0.297 s    [User: 2.084 s, System: 1.218 s]
  Range (min … max):    1.502 s …  2.567 s    10 runs

HEAD is now at 4b23cd3 jar: prefer io.ReadFull over io.ReadAll
  Time (mean ± σ):     445.9 ms ±  51.2 ms    [User: 401.7 ms, System: 79.9 ms]
  Range (min … max):   386.3 ms … 566.1 ms    10 runs

HEAD is now at 37376ef jar: reuse buffers for nested .jar's
  Time (mean ± σ):     464.5 ms ±  41.8 ms    [User: 420.5 ms, System: 93.7 ms]
  Range (min … max):   409.2 ms … 545.5 ms    10 runs

HEAD is now at c17a81b jar: do not keep large buffers unnecessarily
  Time (mean ± σ):     436.1 ms ±  26.2 ms    [User: 409.5 ms, System: 77.6 ms]
  Range (min … max):   390.2 ms … 472.7 ms    10 runs
```
aktau authored and ericchiang committed Mar 18, 2022
1 parent 9e4b36f commit fb3bde4
Showing 3 changed files with 157 additions and 6 deletions.
16 changes: 10 additions & 6 deletions jar/jar.go
@@ -29,6 +29,7 @@ import (
 	"strings"
 	"sync"

+	"github.com/google/log4jscanner/pool"
 	zipfork "github.com/google/log4jscanner/third_party/zip"
 	"rsc.io/binaryregexp"
 )
@@ -255,10 +256,9 @@ var (
 			return make([]byte, bufSize)
 		},
 	}
-	dynBufPool = sync.Pool{
-		New: func() interface{} {
-			return make([]byte, bufSize)
-		},
+	dynBufPool = pool.Dynamic{
+		Pool:       sync.Pool{New: func() interface{} { return make([]byte, 0) }},
+		MinUtility: bufSize,
 	}
 )

@@ -394,7 +394,7 @@ func (c *checker) checkFile(zf *zip.File, depth int, size int64, jar string) err
 	}
 	buf := dynBufPool.Get().([]byte)
 	buf, err = readFull(f, fi, buf)
-	defer dynBufPool.Put(buf)
+	defer dynBufPool.Put(buf, float64(len(buf)), float64(cap(buf)))
 	f.Close() // Recycle the flate buffer earlier, we're going to recurse.
 	if err != nil {
 		return fmt.Errorf("read file %s: %v", p, err)
@@ -419,7 +419,11 @@ func readFull(r io.Reader, fi os.FileInfo, buf []byte) ([]byte, error) {
 		return io.ReadAll(r) // If not a regular file, size may not be accurate.
 	}
 	if size := int(fi.Size()); cap(buf) < size {
-		buf = make([]byte, size)
+		capacity := size
+		if capacity < bufSize {
+			capacity = bufSize // Allocating much smaller buffers could lead to quick re-allocations.
+		}
+		buf = make([]byte, size, capacity)
 	} else {
 		buf = buf[:size]
 	}
67 changes: 67 additions & 0 deletions pool/dynamic.go
@@ -0,0 +1,67 @@
// Package pool provides an object pool that trades off the cost of creation
// versus retention. It is meant to avoid the pessimal behaviour (see [issue
// 23199]) seen when using a regular sync.Pool with objects of dynamic sizes;
// objects that are too large are kept alive by repeat usages that don't need
// such sizes.
//
// [issue 23199]: https://github.com/golang/go/issues/23199
package pool

import (
"math"
"sync"
"sync/atomic"
)

// A Dynamic pool is like a sync.Pool for objects of varying sizes.
//
// It prevents the indefinite retention of (too) large objects by keeping a
// history of required object sizes (utility) and comparing them to the actual
// object size (cost) before accepting an object.
type Dynamic struct {
Pool sync.Pool

// The utility below which the cost of creating the object is more expensive
// than just keeping it. Set this to the expected object size (or perhaps a
// bit larger to reduce allocations more).
MinUtility float64

avgUtility uint64 // Actually a float64, but that type does not have atomic ops.
}

func (p *Dynamic) Get() interface{} {
return p.Pool.Get()
}

// Put is like sync.Pool.Put, with a few differences. The utility is a measure
// of what part of the object was actually used. The cost is a measure of the
// total "size" of the object. Utility must be smaller than or equal to cost.
func (p *Dynamic) Put(v interface{}, utility, cost float64) bool {
// Update the average utility. Uses atomic load/store, which means that
// values can get lost if Put is called concurrently. That's fine, we're
// just looking for an approximate (weighted) moving average.
avgUtility := math.Float64frombits(atomic.LoadUint64(&p.avgUtility))
avgUtility = decay(avgUtility, utility, p.MinUtility)
atomic.StoreUint64(&p.avgUtility, math.Float64bits(avgUtility))

if cost > 10*avgUtility {
return false // The cost is more than 10x the average utility; drop it.
}
p.Pool.Put(v)
return true
}

// decay returns `val` if `val` > `prev`, otherwise it returns an
// exponentially weighted moving average of `prev` and `val` (with factor 0.5).
// This is meant to provide a slower downramp when `val` keeps dropping. The
// minimum value is `min`.
func decay(prev, val, min float64) float64 {
if val < min {
val = min
}
if prev == 0 || val > prev {
return val
}
const factor = 0.5
return (prev * factor) + (val * (1 - factor))
}
80 changes: 80 additions & 0 deletions pool/dynamic_test.go
@@ -0,0 +1,80 @@
package pool

import (
"math"
"sync"
"sync/atomic"
"testing"
)

const bufSize = 4096

// The desired behaviour of the dynamic (buffer) pool is:
// - Don't retain (very) large items indefinitely (check that one is rejected
// at least once).
// - Do retain even large items for a while so their allocation cost is
// amortized.
func TestDynamic(t *testing.T) {
dp := Dynamic{
Pool: sync.Pool{New: func() interface{} { return make([]byte, 0) }},
MinUtility: bufSize,
}
var allocs int
// Simulate a sequence of file sizes. This sequence is not based on some
// real-life observed sequence of sizes of jar-in-jars. It might be better
// to use such a sequence, but every organisation will have its own expected
// sizes and this synthetic one contains some fairly extreme samples that
// check whether the algorithm is robust.
//
// For the current algorithm, the worst possible sequence is one that
// rises, then suddenly drops and then rises slowly again. We contend that
// this case is rare.
sizes := [...]int{
100000, 1, 1, 1, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 12, 1, 1, 1, 1, 1, 1, 1,
1000, 100, 10000, 100000, 1, 100000, 1, 50000, 1, 1, 25000, 1, 1, 1,
100000, 1, 1, 1, 1, 1, 1, 1, 1, 1, 100, 100, 100, 1, 1, 1, 1, 1, 100,
200, 300, 100, 50, 50, 50, 50, 50, 1, 1, 1, 1, 100000000, 1000000,
100000, 10000, 1000, 100, 10, 1, 1, 500, 2020, 400, 3984, 5, 200, 500,
40000, 35000, 45000, 42000, 38000, 38000, 39000, 41000, 42000, 42000, // Average: 40000
2000, 4000, 3949, 2011, 4096, 33, 0, 4938, 1, 1, 1200, 2400, 1200, 200,
400, 600, 700, 100, 400, 500, 700, 600, 900, 1000, 1100, 1200, 1000,
}

var largeBufferPurged int

t.Logf("num allocs value target capacity")
// This test assumes (with some margin for error) that back-to-back Put/Get
// on a pool from a single goroutine yield the same item. I believe this to
// be a fairly stable assumption that avoids plenty of testing boilerplate;
// time will tell.
for idx, size := range sizes {
buf := dp.Get().([]byte)
if cap(buf) < size {
capacity := size
if capacity < bufSize {
capacity = bufSize // Allocating much smaller buffers could lead to quick re-allocations.
}
buf = make([]byte, size, capacity)
allocs++
} else {
buf = buf[:size]
}
utility := float64(len(buf))
if utility < bufSize {
utility = bufSize
}
if !dp.Put(buf, utility, float64(cap(buf))) && cap(buf) >= 100000 {
largeBufferPurged++
}
avgUtility := math.Float64frombits(atomic.LoadUint64(&dp.avgUtility))
t.Logf("%d %d %d %f %d", idx+1, allocs, size, avgUtility, cap(buf))
}
// Before the amortized buffer optimization, each iteration would've been
// one allocation. We want at least 10x fewer than that.
if got, want := allocs, len(sizes)/10; got > want {
t.Errorf("got %d allocations, wanted %d", got, want)
}
if got, atLeast := largeBufferPurged, 2; got < atLeast {
t.Errorf("buffers >= 100000 have been rejected %d times, expected at least %d", got, atLeast)
}
}
