From 80b543bfe6925262225ebfa7c9a82a8c06b29621 Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Mon, 13 Mar 2023 08:46:10 +0100
Subject: [PATCH] zstd: Speed up + improve best encoder (#776)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

name                              old speed      new speed      delta
Encoder_EncodeAllSimple/best-8    14.8MB/s ± 3%  20.7MB/s ± 3%  +39.53%   (p=0.000 n=17+19)
Encoder_EncodeAllSimple4K/best-8  11.8MB/s ± 1%  19.2MB/s ± 6%  +62.17%   (p=0.000 n=20+20)

name                              old alloc/op  new alloc/op  delta
Encoder_EncodeAllSimple/best-8    14.0B ± 0%    10.2B ± 8%    -27.07%   (p=0.000 n=16+19)
Encoder_EncodeAllSimple4K/best-8  1.00B ± 0%    0.00B         -100.00%  (p=0.000 n=20+19)

Also, compressing enwik9 takes 6.375% less wall clock time.

Output from the Silesia corpus and enwik9 is about 0.05% bigger, due to
the different order in which comparisons are done:

dickens      3222189    3220994  (× 0.99963)
enwik9     259699309  259846164  (× 1.00057)
mozilla     16912341   16912437  (× 1.00001)
mr           3505553    3502823  (× 0.99922)
nci          2289871    2306320  (× 1.00718)
ooffice      2896410    2896907  (× 1.00017)
osdb         3390871    3390548  (× 0.99990)
reymont      1656006    1657380  (× 1.00083)
samba        4326783    4329898  (× 1.00072)
sao          5416932    5416648  (× 0.99995)
webster      9966351    9972808  (× 1.00065)
xml           538378     542277  (× 1.00724)
x-ray        5733061    5733121  (× 1.00001)

total      319554055  319728325  (× 1.00055)

This is still smaller than before #705.
---
 zstd/enc_best.go | 65 ++++++++++++++++++++----------------------------
 1 file changed, 27 insertions(+), 38 deletions(-)

diff --git a/zstd/enc_best.go b/zstd/enc_best.go
index 830f5ba74a..07f657d36e 100644
--- a/zstd/enc_best.go
+++ b/zstd/enc_best.go
@@ -32,7 +32,6 @@ type match struct {
 	length int32
 	rep    int32
 	est    int32
-	_      [12]byte // Aligned size to cache line: 4+4+4+4+4 bytes + 12 bytes padding = 32 bytes
 }
 
 const highScore = 25000
@@ -189,12 +188,6 @@ encodeLoop:
 			panic("offset0 was 0")
 		}
 
-		bestOf := func(a, b *match) *match {
-			if a.est-b.est+(a.s-b.s)*bitsPerByte>>10 < 0 {
-				return a
-			}
-			return b
-		}
 		const goodEnough = 100
 
 		nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
@@ -202,40 +195,41 @@ encodeLoop:
 		candidateL := e.longTable[nextHashL]
 		candidateS := e.table[nextHashS]
 
-		matchAt := func(offset int32, s int32, first uint32, rep int32) match {
+		// Set m to a match at offset if it looks like that will improve compression.
+		improve := func(m *match, offset int32, s int32, first uint32, rep int32) {
 			if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
-				return match{s: s, est: highScore}
+				return
 			}
 			if debugAsserts {
 				if !bytes.Equal(src[s:s+4], src[offset:offset+4]) {
 					panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
 				}
 			}
-			m := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
-			m.estBits(bitsPerByte)
-			return m
+			cand := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
+			cand.estBits(bitsPerByte)
+			if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 {
+				*m = cand
+			}
 		}
 
-		m1 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)
-		m2 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
-		m3 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)
-		m4 := matchAt(candidateS.prev-e.cur, s, uint32(cv), -1)
-		best := bestOf(bestOf(&m1, &m2), bestOf(&m3, &m4))
+		best := match{s: s, est: highScore}
+		improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
+		improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
+		improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
+		improve(&best, candidateS.prev-e.cur, s, uint32(cv), -1)
 
 		if canRepeat && best.length < goodEnough {
 			cv32 := uint32(cv >> 8)
 			spp := s + 1
-			m1 := matchAt(spp-offset1, spp, cv32, 1)
-			m2 := matchAt(spp-offset2, spp, cv32, 2)
-			m3 := matchAt(spp-offset3, spp, cv32, 3)
-			best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3))
+			improve(&best, spp-offset1, spp, cv32, 1)
+			improve(&best, spp-offset2, spp, cv32, 2)
+			improve(&best, spp-offset3, spp, cv32, 3)
 			if best.length > 0 {
 				cv32 = uint32(cv >> 24)
 				spp += 2
-				m1 := matchAt(spp-offset1, spp, cv32, 1)
-				m2 := matchAt(spp-offset2, spp, cv32, 2)
-				m3 := matchAt(spp-offset3, spp, cv32, 3)
-				best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3))
+				improve(&best, spp-offset1, spp, cv32, 1)
+				improve(&best, spp-offset2, spp, cv32, 2)
+				improve(&best, spp-offset3, spp, cv32, 3)
 			}
 		}
 		// Load next and check...
@@ -262,18 +256,16 @@ encodeLoop:
 			candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]
 
 			// Short at s+1
-			m1 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)
+			improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
 			// Long at s+1, s+2
-			m2 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)
-			m3 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
-			m4 := matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
-			m5 := matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
-			best = bestOf(bestOf(bestOf(best, &m1), &m2), bestOf(bestOf(&m3, &m4), &m5))
+			improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
+			improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
+			improve(&best, candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
+			improve(&best, candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
 			if false {
 				// Short at s+3.
 				// Too often worse...
-				m := matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)
-				best = bestOf(best, &m)
+				improve(&best, e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)
 			}
 			// See if we can find a better match by checking where the current best ends.
 			// Use that offset to see if we can find a better full match.
@@ -284,13 +276,10 @@ encodeLoop:
 				// For this compression level 2 yields the best results.
 				const skipBeginning = 2
 				if pos := candidateEnd.offset - e.cur - best.length + skipBeginning; pos >= 0 {
-					m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
-					bestEnd := bestOf(best, &m)
+					improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
 					if pos := candidateEnd.prev - e.cur - best.length + skipBeginning; pos >= 0 {
-						m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
-						bestEnd = bestOf(bestEnd, &m)
+						improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
 					}
-					best = bestEnd
 				}
 			}
 		}
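
The core of the change is the pattern of replacing a value-returning matchAt
helper plus a bestOf combinator with a single improve helper that updates the
running best match through a pointer. Below is a minimal standalone sketch of
that pattern; the match type, the matchlen loop, and the cost estimate are
simplified stand-ins (not the encoder's real estBits/matchlen logic), so treat
it as an illustration only.

package main

import "fmt"

// match is a cut-down version of the encoder's match struct.
type match struct {
	offset, s, length, est int32
}

// improve overwrites *best with the candidate starting at offset when that
// candidate exists and scores better. Nothing is returned, so no temporary
// match values need to be kept around for losing candidates.
func improve(best *match, src []byte, offset, s int32) {
	if offset < 0 || offset >= s {
		return // no usable candidate at this offset
	}
	// Extend the match while bytes agree (simplified matchlen).
	var length int32
	for s+length < int32(len(src)) && src[offset+length] == src[s+length] {
		length++
	}
	if length < 4 {
		return // too short to be worth encoding
	}
	cand := match{offset: offset, s: s, length: length}
	// Toy cost estimate: shorter matches and longer back-references cost more.
	cand.est = 100 - length + (s-offset)/8
	if best.length == 0 || cand.est < best.est {
		*best = cand
	}
}

func main() {
	src := []byte("abcabcabcabcxyzabcabcabc")
	s := int32(15) // position being encoded
	best := match{s: s}

	// The caller feeds every candidate position to improve, mirroring the
	// chains of improve(&best, ...) calls in enc_best.go.
	for _, offset := range []int32{0, 3, 6, 9, 12} {
		improve(&best, src, offset, s)
	}
	fmt.Printf("best match: offset=%d length=%d est=%d\n",
		best.offset, best.length, best.est)
}

Because each candidate is now compared one at a time against the running best
rather than through a tree of bestOf calls, near-ties can resolve differently;
the commit message attributes the roughly 0.05% size difference to this change
in comparison order.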