Skip to content

Commit b4888f7

Browse files
Snappy Team authored and pwnall committed
Optimize tag extraction for ARM with conditional increment instruction generation (csinc). For codegen see https://gcc.godbolt.org/z/a8z9j95Pv
PiperOrigin-RevId: 382688740
1 parent b3fb0b5 commit b4888f7

File tree

1 file changed

+25
-2
lines changed

1 file changed

+25
-2
lines changed

snappy.cc

Lines changed: 25 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1004,7 +1004,26 @@ void MemMove(ptrdiff_t dst, const void* src, size_t size) {
10041004
}
10051005

10061006
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
1007-
size_t AdvanceToNextTag(const uint8_t** ip_p, size_t* tag) {
1007+
// Steps the input cursor from the current tag to the next one and fetches the
// next tag byte.
//
// ip_p: in/out. *ip_p is the input cursor; it is advanced in place (through
//       the local reference `ip`) by `offset + 1` bytes, where `offset` is
//       `(*tag >> 2) + 1` when the low two bits of *tag are 0 and is those
//       low two bits otherwise.
// tag:  in/out. On entry, the tag being consumed. On return, the byte that
//       was read at `old_ip[offset]`, i.e. the byte immediately before the
//       updated *ip_p.
//
// Returns the low two bits of the tag that was passed in (0 for what the
// code calls a literal).
//
// This function performs no bounds checks of its own on the read through ip.
// NOTE(review): presumably the caller guarantees enough readable input and
// deals separately with literals whose length does not fit in the tag byte --
// confirm against DecompressBranchless.
size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) {
1008+
const uint8_t*& ip = *ip_p;
1009+
// This section is crucial for the throughput of the decompression loop.
1010+
// The latency of an iteration is fundamentally constrained by the
1011+
// following data chain on ip.
1012+
// ip -> c = Load(ip) -> delta1 = (c & 3)        -> ip += delta1 or delta2
1013+
//                       delta2 = ((c >> 2) + 1)    ip++
1014+
// This is different from X86 optimizations because ARM has conditional add
1015+
// instruction (csinc) and it removes several register moves.
1016+
// Computed unconditionally; only used below when the tag type is 0.
const size_t literal_tag_offset = (*tag >> 2) + 1;
1017+
// The low two bits of the tag select the tag type.
const size_t tag_type = *tag & 3;
1018+
const bool is_literal = (tag_type == 0);
1019+
// Load the next tag before moving ip, so the index is relative to the old
// cursor position.
*tag = is_literal ? ip[literal_tag_offset] : ip[tag_type];
1020+
ip += is_literal ? literal_tag_offset : tag_type;
1021+
// Step past the tag byte that was just loaded. NOTE(review): presumably kept
// as a separate increment so the compiler can emit the select-and-increment
// form (csinc) mentioned above -- confirm with the codegen link in the commit
// message before restructuring.
ip++;
1022+
return tag_type;
1023+
}
1024+
1025+
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
1026+
size_t AdvanceToNextTagX86Optimized(const uint8_t** ip_p, size_t* tag) {
10081027
const uint8_t*& ip = *ip_p;
10091028
// This section is crucial for the throughput of the decompression loop.
10101029
// The latency of an iteration is fundamentally constrained by the
@@ -1084,7 +1103,11 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
10841103
// For literals tag_type = 0, hence we will always obtain 0 from
10851104
// ExtractLowBytes. For literals offset will thus be kLiteralOffset.
10861105
ptrdiff_t len_min_offset = table.length_minus_offset[tag];
1087-
size_t tag_type = AdvanceToNextTag(&ip, &tag);
1106+
#if defined(__aarch64__)
1107+
size_t tag_type = AdvanceToNextTagARMOptimized(&ip, &tag);
1108+
#else
1109+
size_t tag_type = AdvanceToNextTagX86Optimized(&ip, &tag);
1110+
#endif
10881111
uint32_t next = LittleEndian::Load32(old_ip);
10891112
size_t len = len_min_offset & 0xFF;
10901113
len_min_offset -= ExtractOffset(next, tag_type);

0 commit comments

Comments
 (0)