Skip to content

Commit 56ead60

Browse files
committed
improve compression ratio of small alphabets
fix #3328. In situations where the alphabet size is very small, the evaluation of literal costs from the Optimal Parser is initially incorrect. It takes some time to converge, during which compression is less efficient. This is especially important for small files, because there will not be enough data to converge, so most of the parsing is selected based on incorrect metrics. After this patch, the scenario in #3328 is fixed, delivering the expected 29-byte compressed size (smallest known compressed size).
1 parent 0694f14 commit 56ead60

File tree

5 files changed

+76
-40
lines changed

5 files changed

+76
-40
lines changed

lib/compress/huf_compress.c

+10-1
Original file line numberDiff line numberDiff line change
@@ -1253,9 +1253,17 @@ unsigned HUF_minTableLog(unsigned symbolCardinality)
12531253
return minBitsSymbols;
12541254
}
12551255

1256-
unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, size_t wkspSize, HUF_CElt* table, const unsigned* count, HUF_depth_mode depthMode)
1256+
unsigned HUF_optimalTableLog(
1257+
unsigned maxTableLog,
1258+
size_t srcSize,
1259+
unsigned maxSymbolValue,
1260+
void* workSpace, size_t wkspSize,
1261+
HUF_CElt* table,
1262+
const unsigned* count,
1263+
HUF_depth_mode depthMode)
12571264
{
12581265
unsigned optLog = FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
1266+
DEBUGLOG(6, "HUF_optimalTableLog (srcSize=%zu)", srcSize);
12591267
assert(srcSize > 1); /* Not supported, RLE should be used instead */
12601268

12611269
if (depthMode == HUF_depth_optimal) { /** Test valid depths and return optimal **/
@@ -1269,6 +1277,7 @@ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxS
12691277
if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) return optLog;
12701278

12711279
for (huffLog = HUF_minTableLog(symbolCardinality); huffLog <= maxTableLog; huffLog++) {
1280+
DEBUGLOG(7, "checking for huffLog=%u", huffLog);
12721281
maxBits = HUF_buildCTable_wksp(table, count,
12731282
maxSymbolValue, huffLog,
12741283
workSpace, wkspSize);

lib/compress/zstd_compress.c

+5-6
Original file line numberDiff line numberDiff line change
@@ -2661,15 +2661,14 @@ ZSTD_entropyCompressSeqStore_internal(
26612661
unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
26622662
size_t const litSize = (size_t)(seqStorePtr->lit - literals);
26632663

2664-
HUF_depth_mode depthMode = cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_depth_optimal : HUF_depth_fast;
26652664
size_t const cSize = ZSTD_compressLiterals(
2666-
&prevEntropy->huf, &nextEntropy->huf,
2667-
cctxParams->cParams.strategy,
2668-
ZSTD_literalsCompressionIsDisabled(cctxParams),
26692665
op, dstCapacity,
26702666
literals, litSize,
26712667
entropyWorkspace, entropyWkspSize,
2672-
bmi2, suspectUncompressible, depthMode);
2668+
&prevEntropy->huf, &nextEntropy->huf,
2669+
cctxParams->cParams.strategy,
2670+
ZSTD_literalsCompressionIsDisabled(cctxParams),
2671+
suspectUncompressible, bmi2);
26732672
FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
26742673
assert(cSize <= dstCapacity);
26752674
op += cSize;
@@ -3688,12 +3687,12 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end
36883687
size_t estimatedSecondHalfSize;
36893688
size_t midIdx = (startIdx + endIdx)/2;
36903689

3690+
DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
36913691
assert(endIdx >= startIdx);
36923692
if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) {
36933693
DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx);
36943694
return;
36953695
}
3696-
DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
36973696
ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx);
36983697
ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx);
36993698
ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx);

lib/compress/zstd_compress_literals.c

+39-18
Original file line numberDiff line numberDiff line change
@@ -92,16 +92,37 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void*
9292
return flSize+1;
9393
}
9494

95-
size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
96-
ZSTD_hufCTables_t* nextHuf,
97-
ZSTD_strategy strategy, int disableLiteralCompression,
98-
void* dst, size_t dstCapacity,
99-
const void* src, size_t srcSize,
100-
void* entropyWorkspace, size_t entropyWorkspaceSize,
101-
const int bmi2,
102-
unsigned suspectUncompressible, HUF_depth_mode depthMode)
95+
/* ZSTD_minLiteralsToCompress() :
96+
* returns minimal amount of literals
97+
* for literal compression to even be attempted.
98+
* Minimum is made tighter as compression strategy increases.
99+
*/
100+
static size_t
101+
ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat)
102+
{
103+
assert((int)strategy >= 0);
104+
assert((int)strategy <= 9);
105+
/* btultra2 : min 8 bytes;
106+
* then 2x larger for each successive compression strategy
107+
* max threshold 64 bytes */
108+
{ int const shift = MIN(9-strategy, 3);
109+
size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : 8 << shift;
110+
DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc);
111+
return mintc;
112+
}
113+
}
114+
115+
size_t ZSTD_compressLiterals (
116+
void* dst, size_t dstCapacity,
117+
const void* src, size_t srcSize,
118+
void* entropyWorkspace, size_t entropyWorkspaceSize,
119+
const ZSTD_hufCTables_t* prevHuf,
120+
ZSTD_hufCTables_t* nextHuf,
121+
ZSTD_strategy strategy,
122+
int disableLiteralCompression,
123+
int suspectUncompressible,
124+
int bmi2)
103125
{
104-
size_t const minGain = ZSTD_minGain(srcSize, strategy);
105126
size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
106127
BYTE* const ostart = (BYTE*)dst;
107128
U32 singleStream = srcSize < 256;
@@ -119,15 +140,14 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
119140
if (disableLiteralCompression)
120141
return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
121142

122-
/* small ? don't even attempt compression (speed opt) */
123-
# define COMPRESS_LITERALS_SIZE_MIN 63
124-
{ size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
125-
if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
126-
}
143+
/* if too small, don't even attempt compression (speed opt) */
144+
if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode))
145+
return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
127146

128147
RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression");
129148
{ HUF_repeat repeat = prevHuf->repeatMode;
130149
int const preferRepeat = (strategy < ZSTD_lazy) ? srcSize <= 1024 : 0;
150+
HUF_depth_mode const depthMode = (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD) ? HUF_depth_optimal : HUF_depth_fast;
131151
typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int, int, unsigned, HUF_depth_mode);
132152
huf_compress_f huf_compress;
133153
if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
@@ -146,10 +166,11 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
146166
}
147167
}
148168

149-
if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
150-
ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
151-
return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
152-
}
169+
{ size_t const minGain = ZSTD_minGain(srcSize, strategy);
170+
if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
171+
ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
172+
return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
173+
} }
153174
if (cLitSize==1) {
154175
ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
155176
return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);

lib/compress/zstd_compress_literals.h

+11-7
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,18 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src,
1818

1919
size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
2020

21-
/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
22-
size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
23-
ZSTD_hufCTables_t* nextHuf,
24-
ZSTD_strategy strategy, int disableLiteralCompression,
25-
void* dst, size_t dstCapacity,
21+
/* ZSTD_compressLiterals():
22+
* @entropyWorkspace: must be aligned on 4-bytes boundaries
23+
* @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE
24+
* @suspectUncompressible: sampling checks, to potentially skip huffman coding
25+
*/
26+
size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity,
2627
const void* src, size_t srcSize,
2728
void* entropyWorkspace, size_t entropyWorkspaceSize,
28-
const int bmi2,
29-
unsigned suspectUncompressible, HUF_depth_mode depthMode);
29+
const ZSTD_hufCTables_t* prevHuf,
30+
ZSTD_hufCTables_t* nextHuf,
31+
ZSTD_strategy strategy, int disableLiteralCompression,
32+
int suspectUncompressible,
33+
int bmi2);
3034

3135
#endif /* ZSTD_COMPRESS_LITERALS_H */

lib/compress/zstd_opt.c

+11-8
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
1717
#define ZSTD_MAX_PRICE (1<<30)
1818

19-
#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
19+
#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
2020

2121

2222
/*-*************************************
@@ -88,20 +88,24 @@ static U32 sum_u32(const unsigned table[], size_t nbElts)
8888
return total;
8989
}
9090

91-
static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift)
91+
typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e;
92+
93+
static U32
94+
ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1)
9295
{
9396
U32 s, sum=0;
9497
DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift);
9598
assert(shift < 30);
9699
for (s=0; s<lastEltIndex+1; s++) {
97-
table[s] = 1 + (table[s] >> shift);
100+
unsigned const base = base1 ? 1 : (table[s]>0);
101+
table[s] = base + (table[s] >> shift);
98102
sum += table[s];
99103
}
100104
return sum;
101105
}
102106

103107
/* ZSTD_scaleStats() :
104-
* reduce all elements in table is sum too large
108+
* reduce all elements in table if sum too large
105109
* return the resulting sum of elements */
106110
static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
107111
{
@@ -110,7 +114,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
110114
DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget);
111115
assert(logTarget < 30);
112116
if (factor <= 1) return prevsum;
113-
return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor));
117+
return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed);
114118
}
115119

116120
/* ZSTD_rescaleFreqs() :
@@ -188,13 +192,13 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
188192
optPtr->offCodeSum += optPtr->offCodeFreq[of];
189193
} }
190194

191-
} else { /* not a dictionary */
195+
} else { /* first block, no dictionary */
192196

193197
assert(optPtr->litFreq != NULL);
194198
if (compressedLiterals) {
195199
unsigned lit = MaxLit;
196200
HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */
197-
optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8);
201+
optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible);
198202
}
199203

200204
{ unsigned const baseLLfreqs[MaxLL+1] = {
@@ -224,7 +228,6 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
224228
optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1);
225229
}
226230

227-
228231
}
229232

230233
} else { /* new block : re-use previous statistics, scaled down */

0 commit comments

Comments
 (0)