@@ -1004,7 +1004,26 @@ void MemMove(ptrdiff_t dst, const void* src, size_t size) {
10041004}
10051005
10061006SNAPPY_ATTRIBUTE_ALWAYS_INLINE
1007- size_t AdvanceToNextTag (const uint8_t ** ip_p, size_t * tag) {
1007+ size_t AdvanceToNextTagARMOptimized (const uint8_t ** ip_p, size_t * tag) {  // Loads the next tag into *tag, advances *ip_p past it, and returns the type bits of the tag passed in.
1008+ const uint8_t *& ip = *ip_p;  // Alias the caller's pointer so the advance below is written back through ip_p.
1009+ // This section is crucial for the throughput of the decompression loop.
1010+ // The latency of an iteration is fundamentally constrained by the
1011+ // following data chain on ip:
1012+ // ip -> c = Load(ip) -> delta1 = (c & 3)        -> ip += delta1 or delta2
1013+ //                       delta2 = ((c >> 2) + 1)    ip++
1014+ // This differs from the X86 variant because ARM has a conditional add
1015+ // instruction (csinc), which removes several register moves.
1016+ const size_t literal_tag_offset = (*tag >> 2 ) + 1 ;  // delta2 in the chain above; only used when the tag is a literal.
1017+ const size_t tag_type = *tag & 3 ;  // delta1 in the chain above: the low two bits of the tag.
1018+ const bool is_literal = (tag_type == 0 );  // A tag_type of 0 marks a literal.
1019+ *tag = is_literal ? ip[literal_tag_offset] : ip[tag_type];  // Load the next tag byte before ip is moved.
1020+ ip += is_literal ? literal_tag_offset : tag_type;  // Advance by delta2 for a literal, by delta1 otherwise ...
1021+ ip++;  // ... plus one, leaving ip just past the tag byte loaded above.
1022+ return tag_type;  // Type of the tag passed in, not of the tag just loaded.
1023+ }
1024+
1025+ SNAPPY_ATTRIBUTE_ALWAYS_INLINE
1026+ size_t AdvanceToNextTagX86Optimized (const uint8_t ** ip_p, size_t * tag) {
10081027 const uint8_t *& ip = *ip_p;
10091028 // This section is crucial for the throughput of the decompression loop.
10101029 // The latency of an iteration is fundamentally constrained by the
@@ -1084,7 +1103,11 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
10841103 // For literals tag_type = 0, hence we will always obtain 0 from
10851104 // ExtractLowBytes. For literals offset will thus be kLiteralOffset.
10861105 ptrdiff_t len_min_offset = table.length_minus_offset [tag];
1087- size_t tag_type = AdvanceToNextTag (&ip, &tag);
1106+ #if defined(__aarch64__)
1107+ size_t tag_type = AdvanceToNextTagARMOptimized (&ip, &tag);
1108+ #else
1109+ size_t tag_type = AdvanceToNextTagX86Optimized (&ip, &tag);
1110+ #endif
10881111 uint32_t next = LittleEndian::Load32 (old_ip);
10891112 size_t len = len_min_offset & 0xFF ;
10901113 len_min_offset -= ExtractOffset (next, tag_type);
0 commit comments