From 2491c65937d42561ef9238422929213751adcc10 Mon Sep 17 00:00:00 2001 From: Jun He Date: Mon, 23 May 2022 14:25:10 +0800 Subject: [PATCH] dec: adjust seqSymbol load on aarch64 ZSTD_seqSymbol is a structure with total of 64 bits wide. So it can be loaded in one operation and extract its fields by simply shifting or extracting on aarch64. GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh operations that cause performance drop. With this change it is observed 2~4% uplift of silesia and 2.5~6% of cantrbry @L8 on Arm N1. Signed-off-by: Jun He Change-Id: I7748909204cf78a17eb9d4f2333692d53239daa8 --- lib/decompress/zstd_decompress_block.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c index 7c046dab506..466e83b6be1 100644 --- a/lib/decompress/zstd_decompress_block.c +++ b/lib/decompress/zstd_decompress_block.c @@ -1170,9 +1170,27 @@ FORCE_INLINE_TEMPLATE seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) { seq_t seq; + /* + * ZSTD_seqSymbol is a structure with a total of 64 bits wide. So it can be + * loaded in one operation and extracted its fields by simply shifting or + * bit-extracting on aarch64. + * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh + * operations that cause performance drop. This can be avoided by using this + * ZSTD_memcpy hack. + */ +#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) + ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; + ZSTD_seqSymbol* const llDInfo = &llDInfoS; + ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; + ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; + ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); + ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); + ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); +#else const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; +#endif seq.matchLength = mlDInfo->baseValue; seq.litLength = llDInfo->baseValue; { U32 const ofBase = ofDInfo->baseValue;