From 80b543bfe6925262225ebfa7c9a82a8c06b29621 Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Mon, 13 Mar 2023 08:46:10 +0100
Subject: [PATCH] zstd: Speed up + improve best encoder (#776)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

name                              old speed      new speed      delta
Encoder_EncodeAllSimple/best-8    14.8MB/s ± 3%  20.7MB/s ± 3%  +39.53%   (p=0.000 n=17+19)
Encoder_EncodeAllSimple4K/best-8  11.8MB/s ± 1%  19.2MB/s ± 6%  +62.17%   (p=0.000 n=20+20)

name                              old alloc/op  new alloc/op  delta
Encoder_EncodeAllSimple/best-8    14.0B ± 0%    10.2B ± 8%    -27.07%   (p=0.000 n=16+19)
Encoder_EncodeAllSimple4K/best-8  1.00B ± 0%    0.00B         -100.00%  (p=0.000 n=20+19)

Also, compressing enwik9 takes 6.375% less wall clock time.

Output from the Silesia corpus and enwik9 is about 0.05% bigger, due to
the different order in which comparisons are done:

dickens      3222189    3220994  (× 0.99963)
enwik9     259699309  259846164  (× 1.00057)
mozilla     16912341   16912437  (× 1.00001)
mr           3505553    3502823  (× 0.99922)
nci          2289871    2306320  (× 1.00718)
ooffice      2896410    2896907  (× 1.00017)
osdb         3390871    3390548  (× 0.99990)
reymont      1656006    1657380  (× 1.00083)
samba        4326783    4329898  (× 1.00072)
sao          5416932    5416648  (× 0.99995)
webster      9966351    9972808  (× 1.00065)
xml           538378     542277  (× 1.00724)
x-ray        5733061    5733121  (× 1.00001)

total      319554055  319728325  (× 1.00055)

This is still smaller than before #705.
---
 zstd/enc_best.go | 65 ++++++++++++++++++++----------------------------
 1 file changed, 27 insertions(+), 38 deletions(-)

diff --git a/zstd/enc_best.go b/zstd/enc_best.go
index 830f5ba74a..07f657d36e 100644
--- a/zstd/enc_best.go
+++ b/zstd/enc_best.go
@@ -32,7 +32,6 @@ type match struct {
 	length int32
 	rep    int32
 	est    int32
-	_      [12]byte // Aligned size to cache line: 4+4+4+4+4 bytes + 12 bytes padding = 32 bytes
 }
 
 const highScore = 25000
@@ -189,12 +188,6 @@ encodeLoop:
 			panic("offset0 was 0")
 		}
 
-		bestOf := func(a, b *match) *match {
-			if a.est-b.est+(a.s-b.s)*bitsPerByte>>10 < 0 {
-				return a
-			}
-			return b
-		}
 		const goodEnough = 100
 
 		nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
@@ -202,40 +195,41 @@ encodeLoop:
 		candidateL := e.longTable[nextHashL]
 		candidateS := e.table[nextHashS]
 
-		matchAt := func(offset int32, s int32, first uint32, rep int32) match {
+		// Set m to a match at offset if it looks like that will improve compression.
+		improve := func(m *match, offset int32, s int32, first uint32, rep int32) {
 			if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
-				return match{s: s, est: highScore}
+				return
 			}
 			if debugAsserts {
 				if !bytes.Equal(src[s:s+4], src[offset:offset+4]) {
 					panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
 				}
 			}
-			m := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
-			m.estBits(bitsPerByte)
-			return m
+			cand := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
+			cand.estBits(bitsPerByte)
+			if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 {
+				*m = cand
+			}
 		}
 
-		m1 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)
-		m2 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
-		m3 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)
-		m4 := matchAt(candidateS.prev-e.cur, s, uint32(cv), -1)
-		best := bestOf(bestOf(&m1, &m2), bestOf(&m3, &m4))
+		best := match{s: s, est: highScore}
+		improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
+		improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
+		improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
+		improve(&best, candidateS.prev-e.cur, s, uint32(cv), -1)
 
 		if canRepeat && best.length < goodEnough {
 			cv32 := uint32(cv >> 8)
 			spp := s + 1
-			m1 := matchAt(spp-offset1, spp, cv32, 1)
-			m2 := matchAt(spp-offset2, spp, cv32, 2)
-			m3 := matchAt(spp-offset3, spp, cv32, 3)
-			best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3))
+			improve(&best, spp-offset1, spp, cv32, 1)
+			improve(&best, spp-offset2, spp, cv32, 2)
+			improve(&best, spp-offset3, spp, cv32, 3)
 			if best.length > 0 {
 				cv32 = uint32(cv >> 24)
 				spp += 2
-				m1 := matchAt(spp-offset1, spp, cv32, 1)
-				m2 := matchAt(spp-offset2, spp, cv32, 2)
-				m3 := matchAt(spp-offset3, spp, cv32, 3)
-				best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3))
+				improve(&best, spp-offset1, spp, cv32, 1)
+				improve(&best, spp-offset2, spp, cv32, 2)
+				improve(&best, spp-offset3, spp, cv32, 3)
 			}
 		}
 		// Load next and check...
@@ -262,18 +256,16 @@ encodeLoop:
 			candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]
 
 			// Short at s+1
-			m1 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)
+			improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
 			// Long at s+1, s+2
-			m2 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)
-			m3 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
-			m4 := matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
-			m5 := matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
-			best = bestOf(bestOf(bestOf(best, &m1), &m2), bestOf(bestOf(&m3, &m4), &m5))
+			improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
+			improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
+			improve(&best, candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
+			improve(&best, candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
 			if false {
 				// Short at s+3.
 				// Too often worse...
-				m := matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)
-				best = bestOf(best, &m)
+				improve(&best, e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)
 			}
 			// See if we can find a better match by checking where the current best ends.
 			// Use that offset to see if we can find a better full match.
@@ -284,13 +276,10 @@ encodeLoop:
 				// For this compression level 2 yields the best results.
 				const skipBeginning = 2
 				if pos := candidateEnd.offset - e.cur - best.length + skipBeginning; pos >= 0 {
-					m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
-					bestEnd := bestOf(best, &m)
+					improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
 					if pos := candidateEnd.prev - e.cur - best.length + skipBeginning; pos >= 0 {
-						m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
-						bestEnd = bestOf(bestEnd, &m)
+						improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
 					}
-					best = bestEnd
 				}
 			}
 		}
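
The core of the change is the pattern of replacing a value-returning matchAt
helper plus a bestOf combinator with a single improve helper that updates the
running best match through a pointer. Below is a minimal standalone sketch of
that pattern; the match type, the matchlen loop, and the cost estimate are
simplified stand-ins (not the encoder's real estBits/matchlen logic), so treat
it as an illustration only.

package main

import "fmt"

// match is a cut-down version of the encoder's match struct.
type match struct {
	offset, s, length, est int32
}

// improve overwrites *best with the candidate starting at offset when that
// candidate exists and scores better. Nothing is returned, so no temporary
// match values need to be kept around for losing candidates.
func improve(best *match, src []byte, offset, s int32) {
	if offset < 0 || offset >= s {
		return // no usable candidate at this offset
	}
	// Extend the match while bytes agree (simplified matchlen).
	var length int32
	for s+length < int32(len(src)) && src[offset+length] == src[s+length] {
		length++
	}
	if length < 4 {
		return // too short to be worth encoding
	}
	cand := match{offset: offset, s: s, length: length}
	// Toy cost estimate: shorter matches and longer back-references cost more.
	cand.est = 100 - length + (s-offset)/8
	if best.length == 0 || cand.est < best.est {
		*best = cand
	}
}

func main() {
	src := []byte("abcabcabcabcxyzabcabcabc")
	s := int32(15) // position being encoded
	best := match{s: s}

	// The caller feeds every candidate position to improve, mirroring the
	// chains of improve(&best, ...) calls in enc_best.go.
	for _, offset := range []int32{0, 3, 6, 9, 12} {
		improve(&best, src, offset, s)
	}
	fmt.Printf("best match: offset=%d length=%d est=%d\n",
		best.offset, best.length, best.est)
}

Because each candidate is now compared one at a time against the running best
rather than through a tree of bestOf calls, near-ties can resolve differently;
the commit message attributes the roughly 0.05% size difference to this change
in comparison order.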