Skip to content

Commit 9cdf0bc

Browse files
committed
[lazy] Use switch instead of indirect function calls.
Use a switch statement to select the search function instead of an indirect function call. This results in a sizable performance win.

This PR is a modification of the approach taken in PR #2828. When I measured performance for that commit, it was neutral. However, I now see a performance regression on gcc, while clang is still neutral. I'm measuring on the same platform, but with newer compilers. The new approach beats both the current dev branch and the baseline before PR #2828 was merged.

This PR is necessary for Issue #3275, to update zstd in the kernel. Without this PR there is a large regression in greedy - btlazy2 compression speed. With this PR it is about neutral.

- gcc version: 12.2.0
- clang version: 14.0.6
- dataset: silesia.tar

| Compiler | Level | Dev Speed (MB/s) | PR Speed (MB/s) | Delta  |
|----------|-------|------------------|-----------------|--------|
| gcc      | 5     | 102.6            | 113.7           | +10.8% |
| gcc      | 7     | 66.6             | 74.8            | +12.3% |
| gcc      | 9     | 51.5             | 58.9            | +14.3% |
| gcc      | 13    | 14.3             | 14.3            | +0.0%  |
| clang    | 5     | 108.1            | 114.8           | +6.2%  |
| clang    | 7     | 68.5             | 72.3            | +5.5%  |
| clang    | 9     | 53.2             | 56.2            | +5.6%  |
| clang    | 13    | 14.3             | 14.7            | +2.8%  |

The binary size stays just about the same for clang and gcc, measured using the `size` command:

| Compiler | Branch | Text    | Data | BSS | Total   |
|----------|--------|---------|------|-----|---------|
| gcc      | dev    | 1127950 | 3312 | 280 | 1131542 |
| gcc      | PR     | 1123422 | 2512 | 280 | 1126214 |
| clang    | dev    | 1046254 | 3256 | 216 | 1049726 |
| clang    | PR     | 1048198 | 2296 | 216 | 1050710 |
1 parent 5c1cdba commit 9cdf0bc

File tree

1 file changed

+120
-102
lines changed

1 file changed

+120
-102
lines changed

lib/compress/zstd_lazy.c

+120-102
Original file line numberDiff line numberDiff line change
@@ -1317,14 +1317,10 @@ size_t ZSTD_RowFindBestMatch(
13171317
}
13181318

13191319

1320-
typedef size_t (*searchMax_f)(
1321-
ZSTD_matchState_t* ms,
1322-
const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1323-
13241320
/**
1325-
* This struct contains the functions necessary for lazy to search.
1326-
* Currently, that is only searchMax. However, it is still valuable to have the
1327-
* VTable because this makes it easier to add more functions to the VTable later.
1321+
* Generate search functions templated on (dictMode, mls, rowLog).
1322+
* These functions are outlined for code size & compilation time.
1323+
* ZSTD_searchMax() dispatches to the correct implementation function.
13281324
*
13291325
* TODO: The start of the search function involves loading and calculating a
13301326
* bunch of constants from the ZSTD_matchState_t. These computations could be
@@ -1342,38 +1338,35 @@ typedef size_t (*searchMax_f)(
13421338
* the single segment loop. It should go in searchMax instead of its own
13431339
* function to avoid having multiple virtual function calls per search.
13441340
*/
1345-
typedef struct {
1346-
searchMax_f searchMax;
1347-
} ZSTD_LazyVTable;
13481341

1349-
#define GEN_ZSTD_BT_VTABLE(dictMode, mls) \
1350-
static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \
1342+
#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
1343+
#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
1344+
#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog
1345+
1346+
#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
1347+
1348+
#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \
1349+
ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \
13511350
ZSTD_matchState_t* ms, \
13521351
const BYTE* ip, const BYTE* const iLimit, \
13531352
size_t* offBasePtr) \
13541353
{ \
13551354
assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
13561355
return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode);\
13571356
} \
1358-
static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \
1359-
ZSTD_BtFindBestMatch_##dictMode##_##mls \
1360-
};
13611357

1362-
#define GEN_ZSTD_HC_VTABLE(dictMode, mls) \
1363-
static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \
1358+
#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \
1359+
ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \
13641360
ZSTD_matchState_t* ms, \
13651361
const BYTE* ip, const BYTE* const iLimit, \
13661362
size_t* offsetPtr) \
13671363
{ \
13681364
assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
13691365
return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
13701366
} \
1371-
static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
1372-
ZSTD_HcFindBestMatch_##dictMode##_##mls \
1373-
};
13741367

1375-
#define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \
1376-
static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \
1368+
#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \
1369+
ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \
13771370
ZSTD_matchState_t* ms, \
13781371
const BYTE* ip, const BYTE* const iLimit, \
13791372
size_t* offsetPtr) \
@@ -1382,9 +1375,6 @@ typedef struct {
13821375
assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
13831376
return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
13841377
} \
1385-
static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \
1386-
ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \
1387-
};
13881378

13891379
#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
13901380
X(dictMode, mls, 4) \
@@ -1407,84 +1397,114 @@ typedef struct {
14071397
X(__VA_ARGS__, dictMatchState) \
14081398
X(__VA_ARGS__, dedicatedDictSearch)
14091399

1410-
/* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
1411-
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
1412-
/* Generate Binary Tree VTables for each combination of (dictMode, mls) */
1413-
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
1414-
/* Generate Hash Chain VTables for each combination of (dictMode, mls) */
1415-
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
1416-
1417-
#define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
1418-
{ \
1419-
&ZSTD_BtVTable_##dictMode##_4, \
1420-
&ZSTD_BtVTable_##dictMode##_5, \
1421-
&ZSTD_BtVTable_##dictMode##_6 \
1422-
}
1423-
1424-
#define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
1425-
{ \
1426-
&ZSTD_HcVTable_##dictMode##_4, \
1427-
&ZSTD_HcVTable_##dictMode##_5, \
1428-
&ZSTD_HcVTable_##dictMode##_6 \
1429-
}
1400+
/* Generate row search fns for each combination of (dictMode, mls, rowLog) */
1401+
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
1402+
/* Generate binary tree search fns for each combination of (dictMode, mls) */
1403+
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
1404+
/* Generate hash chain search fns for each combination of (dictMode, mls) */
1405+
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)
1406+
typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
14301407

1431-
#define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
1432-
{ \
1433-
&ZSTD_RowVTable_##dictMode##_##mls##_4, \
1434-
&ZSTD_RowVTable_##dictMode##_##mls##_5, \
1435-
&ZSTD_RowVTable_##dictMode##_##mls##_6 \
1408+
#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
1409+
return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
1410+
#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
1411+
return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
1412+
#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, rowLog, mls) \
1413+
return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
1414+
1415+
#define ZSTD_SWITCH_MLS(X, ...) \
1416+
switch (mls) { \
1417+
default: assert(0); \
1418+
case 4: \
1419+
X(__VA_ARGS__, 4) \
1420+
break; \
1421+
case 5: \
1422+
X(__VA_ARGS__, 5) \
1423+
break; \
1424+
case 6: \
1425+
X(__VA_ARGS__, 6) \
1426+
break; \
14361427
}
14371428

1438-
#define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
1439-
{ \
1440-
GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
1441-
GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
1442-
GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
1429+
#define ZSTD_SWITCH_ROW_LOG(X, ...) \
1430+
switch (rowLog) { \
1431+
default: assert(0); \
1432+
case 4: \
1433+
ZSTD_SWITCH_MLS(X, __VA_ARGS__, 4) \
1434+
break; \
1435+
case 5: \
1436+
ZSTD_SWITCH_MLS(X, __VA_ARGS__, 5) \
1437+
break; \
1438+
case 6: \
1439+
ZSTD_SWITCH_MLS(X, __VA_ARGS__, 6) \
1440+
break; \
14431441
}
14441442

1445-
#define GEN_ZSTD_VTABLE_ARRAY(X) \
1446-
{ \
1447-
X(noDict), \
1448-
X(extDict), \
1449-
X(dictMatchState), \
1450-
X(dedicatedDictSearch) \
1443+
#define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \
1444+
switch (searchMethod) { \
1445+
default: assert(0); \
1446+
case search_hashChain: \
1447+
ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
1448+
break; \
1449+
case search_binaryTree: \
1450+
ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
1451+
break; \
1452+
case search_rowHash: \
1453+
ZSTD_SWITCH_ROW_LOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode) \
1454+
break; \
14511455
}
14521456

1453-
/* *******************************
1454-
* Common parser - lazy strategy
1455-
*********************************/
1456-
typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1457-
14581457
/**
1459-
* This table is indexed first by the four ZSTD_dictMode_e values, and then
1460-
* by the two searchMethod_e values. NULLs are placed for configurations
1461-
* that should never occur (extDict modes go to the other implementation
1462-
* below and there is no DDSS for binary tree search yet).
1458+
* Searches for the longest match at @p ip.
1459+
* Dispatches to the correct implementation function based on the
1460+
* (searchMethod, dictMode, mls, rowLog). We use switch statements
1461+
* here instead of using an indirect function call through a function
1462+
* pointer because after Spectre and Meltdown mitigations, indirect
1463+
* function calls can be very costly, especially in the kernel.
1464+
*
1465+
* NOTE: dictMode and searchMethod should be templated, so those switch
1466+
* statements should be optimized out. Only the mls & rowLog switches
1467+
* should be left.
1468+
*
1469+
* @param ms The match state.
1470+
* @param ip The position to search at.
1471+
* @param iend The end of the input data.
1472+
* @param[out] offsetPtr Stores the match offset into this pointer.
1473+
* @param mls The minimum search length, in the range [4, 6].
1474+
* @param rowLog The row log (if applicable), in the range [4, 6].
1475+
* @param searchMethod The search method to use (templated).
1476+
* @param dictMode The dictMode (templated).
1477+
*
1478+
* @returns The length of the longest match found, or < mls if no match is found.
1479+
* If a match is found its offset is stored in @p offsetPtr.
14631480
*/
1464-
1465-
static ZSTD_LazyVTable const*
1466-
ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
1481+
FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
1482+
ZSTD_matchState_t* ms,
1483+
const BYTE* ip,
1484+
const BYTE* iend,
1485+
size_t* offsetPtr,
1486+
U32 const mls,
1487+
U32 const rowLog,
1488+
searchMethod_e const searchMethod,
1489+
ZSTD_dictMode_e const dictMode)
14671490
{
1468-
/* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
1469-
ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
1470-
ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
1471-
/* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
1472-
ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
1473-
1474-
U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
1475-
U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1476-
switch (searchMethod) {
1477-
case search_hashChain:
1478-
return hcVTables[dictMode][mls - 4];
1479-
case search_binaryTree:
1480-
return btVTables[dictMode][mls - 4];
1481-
case search_rowHash:
1482-
return rowVTables[dictMode][mls - 4][rowLog - 4];
1483-
default:
1484-
return NULL;
1491+
if (dictMode == ZSTD_noDict) {
1492+
ZSTD_SWITCH_SEARCH_METHOD(noDict)
1493+
} else if (dictMode == ZSTD_extDict) {
1494+
ZSTD_SWITCH_SEARCH_METHOD(extDict)
1495+
} else if (dictMode == ZSTD_dictMatchState) {
1496+
ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
1497+
} else if (dictMode == ZSTD_dedicatedDictSearch) {
1498+
ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
14851499
}
1500+
assert(0);
1501+
return 0;
14861502
}
14871503

1504+
/* *******************************
1505+
* Common parser - lazy strategy
1506+
*********************************/
1507+
14881508
FORCE_INLINE_TEMPLATE size_t
14891509
ZSTD_compressBlock_lazy_generic(
14901510
ZSTD_matchState_t* ms, seqStore_t* seqStore,
@@ -1501,8 +1521,9 @@ ZSTD_compressBlock_lazy_generic(
15011521
const BYTE* const base = ms->window.base;
15021522
const U32 prefixLowestIndex = ms->window.dictLimit;
15031523
const BYTE* const prefixLowest = base + prefixLowestIndex;
1524+
const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
1525+
const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
15041526

1505-
searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
15061527
U32 offset_1 = rep[0], offset_2 = rep[1];
15071528
U32 offsetSaved1 = 0, offsetSaved2 = 0;
15081529

@@ -1519,8 +1540,6 @@ ZSTD_compressBlock_lazy_generic(
15191540
0;
15201541
const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
15211542

1522-
assert(searchMax != NULL);
1523-
15241543
DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
15251544
ip += (dictAndPrefixLength == 0);
15261545
if (dictMode == ZSTD_noDict) {
@@ -1538,7 +1557,6 @@ ZSTD_compressBlock_lazy_generic(
15381557
}
15391558

15401559
if (searchMethod == search_rowHash) {
1541-
const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
15421560
ZSTD_row_fillHashCache(ms, base, rowLog,
15431561
MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
15441562
ms->nextToUpdate, ilimit);
@@ -1579,7 +1597,7 @@ ZSTD_compressBlock_lazy_generic(
15791597

15801598
/* first search (depth 0) */
15811599
{ size_t offbaseFound = 999999999;
1582-
size_t const ml2 = searchMax(ms, ip, iend, &offbaseFound);
1600+
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
15831601
if (ml2 > matchLength)
15841602
matchLength = ml2, start = ip, offBase = offbaseFound;
15851603
}
@@ -1618,7 +1636,7 @@ ZSTD_compressBlock_lazy_generic(
16181636
}
16191637
}
16201638
{ size_t ofbCandidate=999999999;
1621-
size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
1639+
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
16221640
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
16231641
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
16241642
if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -1654,7 +1672,7 @@ ZSTD_compressBlock_lazy_generic(
16541672
}
16551673
}
16561674
{ size_t ofbCandidate=999999999;
1657-
size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
1675+
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
16581676
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
16591677
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
16601678
if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -1899,9 +1917,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
18991917
const BYTE* const dictEnd = dictBase + dictLimit;
19001918
const BYTE* const dictStart = dictBase + ms->window.lowLimit;
19011919
const U32 windowLog = ms->cParams.windowLog;
1902-
const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1920+
const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
1921+
const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
19031922

1904-
searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
19051923
U32 offset_1 = rep[0], offset_2 = rep[1];
19061924

19071925
DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
@@ -1943,7 +1961,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
19431961

19441962
/* first search (depth 0) */
19451963
{ size_t ofbCandidate = 999999999;
1946-
size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
1964+
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
19471965
if (ml2 > matchLength)
19481966
matchLength = ml2, start = ip, offBase = ofbCandidate;
19491967
}
@@ -1978,7 +1996,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
19781996

19791997
/* search match, depth 1 */
19801998
{ size_t ofbCandidate = 999999999;
1981-
size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
1999+
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
19822000
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
19832001
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
19842002
if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -2010,7 +2028,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
20102028

20112029
/* search match, depth 2 */
20122030
{ size_t ofbCandidate = 999999999;
2013-
size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
2031+
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
20142032
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
20152033
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
20162034
if ((ml2 >= 4) && (gain2 > gain1)) {

0 commit comments

Comments
 (0)