Skip to content

Commit

Permalink
Optimize absl::StrCat for integers
Browse files Browse the repository at this point in the history
The previous implementation used a 200-byte lookup table which, when not in L1 cache, could cause memory stalls, and which occupies 3-4 cache lines. This is suboptimal in real-world scenarios. This change switches to a purely scalar approach that merges 2/4/8 digit bytes together.

PiperOrigin-RevId: 533576619
Change-Id: Icd730d18536f7eb35979b62582f6edf86b786019
  • Loading branch information
Abseil Team authored and copybara-github committed May 19, 2023
1 parent aaf81ec commit 15d26cd
Showing 1 changed file with 143 additions and 102 deletions.
245 changes: 143 additions & 102 deletions absl/strings/numbers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

#include "absl/base/attributes.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/optimization.h"
#include "absl/numeric/bits.h"
#include "absl/strings/ascii.h"
#include "absl/strings/charconv.h"
Expand Down Expand Up @@ -136,82 +137,123 @@ bool SimpleAtob(absl::string_view str, bool* out) {

namespace {

// Lookup table for emitting the last decimal digit of a number: entry `d`
// (0 <= d <= 9) holds the ASCII character for `d` followed by a NUL byte, so
// copying both bytes writes the digit and terminates the string in one step.
const char one_ASCII_final_digits[10][2] = {
    {'0', '\0'}, {'1', '\0'}, {'2', '\0'}, {'3', '\0'}, {'4', '\0'},
    {'5', '\0'}, {'6', '\0'}, {'7', '\0'}, {'8', '\0'}, {'9', '\0'},
};
// Various routines to encode integers to strings.

// We split data encodings into a group of 2 digits, 4 digits, 8 digits as
// it's easier to combine powers of two into scalar arithmetic.

// Previous implementation used a lookup table of 200 bytes for every 2 bytes
// and it was memory bound, any L1 cache miss would result in a much slower
// result. When benchmarking with a cache eviction rate of several percent,
// this implementation proved to be better.

// ASCII "00", "0000" and "00000000" packed into integers, one '0' per byte.
// Because the character for digit d is '0' + d (0 <= d <= 9), adding one of
// these to an integer whose bytes each hold a digit value 0..9 converts every
// byte to its ASCII character at once.
constexpr uint32_t kTwoZeroBytes = ('0' << 8) | '0';
constexpr uint64_t kFourZeroBytes =
    (uint64_t{kTwoZeroBytes} << 16) | kTwoZeroBytes;
constexpr uint64_t kEightZeroBytes = (kFourZeroBytes << 32) | kFourZeroBytes;

// Multiplying by 103 and dividing by 1024 equals division by 10 for every
// value in [0, 99]. Applied to an integer laid out as
// [k in 2 bytes][m in 2 bytes] it yields [k / 10][m / 10] (after masking off
// the spill-over bits), i.e. several divisions are performed in parallel.
constexpr uint64_t kDivisionBy10Mul = 103;
constexpr uint64_t kDivisionBy10Div = uint64_t{1} << 10;

// Multiplying by 10486 and dividing by 1048576 (2^20) equals division by 100
// for every value in [0, 9999].
constexpr uint64_t kDivisionBy100Mul = 10486;
constexpr uint64_t kDivisionBy100Div = uint64_t{1} << 20;

// Encode functions write the ASCII output of input `n` to `out_str`.

// Writes the decimal digits of `n` (requires n < 100) to `out_str` without
// leading zeros and without a terminating NUL. Returns a pointer just past the
// last digit written. Two bytes are always stored, so `out_str` must have room
// for two chars even when `n` has a single digit (the second byte is then 0).
inline char* EncodeHundred(uint32_t n, char* out_str) {
  // -1 when n < 10 (n - 10 wraps around and is negative after the cast),
  // 0 when 10 <= n < 100. Used both as a shift selector and a length fix-up.
  int num_digits = static_cast<int>(n - 10) >> 8;
  uint32_t div10 = (n * kDivisionBy10Mul) / kDivisionBy10Div;
  uint32_t mod10 = n - 10u * div10;
  // Low byte: tens digit, next byte: ones digit, both converted to ASCII.
  uint32_t base = kTwoZeroBytes + div10 + (mod10 << 8);
  // For a one-digit number drop the leading '0' by shifting a whole byte out.
  base >>= num_digits & 8;
  // Store least-significant byte first explicitly. Copying the object
  // representation with memcpy only produces this order on little-endian
  // hosts; on big-endian hosts it would emit the wrong bytes. Compilers merge
  // these two stores into a single 16-bit store where that is legal.
  out_str[0] = static_cast<char>(base & 0xFFu);
  out_str[1] = static_cast<char>((base >> 8) & 0xFFu);
  return out_str + 2 + num_digits;
}

} // namespace
// Writes the decimal digits of `n` (requires 0 < n < 10000) to `out_str`
// without leading zeros and without a terminating NUL. Returns a pointer just
// past the last digit written. Four bytes are always stored, so `out_str` must
// have room for four chars regardless of how many digits `n` has.
//
// `n` must not be 0: with all digit bytes zero the trailing-zero count below
// would equal the full width of the word and the subsequent shift would be
// undefined. ABSL_ASSUME records that precondition for the optimizer.
inline char* EncodeTenThousand(uint32_t n, char* out_str) {
  // Split n into its upper and lower two-digit halves and place them in two
  // 16-bit lanes (n = 123 gives lanes {1, 23}). Both lanes are then divided by
  // 10 with a single multiply + shift + mask, and the remainders are computed
  // "in parallel" the same way. Afterwards byte k (counting from the least
  // significant byte) holds the k-th most significant digit of n as a value
  // in 0..9.
  uint32_t div100 = (n * kDivisionBy100Mul) / kDivisionBy100Div;
  uint32_t mod100 = n - 100ull * div100;
  uint32_t hundreds = (mod100 << 16) + div100;
  uint32_t tens = (hundreds * kDivisionBy10Mul) / kDivisionBy10Div;
  tens &= (0xFull << 16) | 0xFull;
  tens += (hundreds - 10ull * tens) << 8;
  ABSL_ASSUME(tens != 0);
  // Leading zero digits of n are the low-order zero bytes of `tens`. Count the
  // trailing zero bits and round down to a multiple of 8 (`& -8`) so that only
  // whole zero bytes are stripped, never the zero bits inside a non-zero
  // digit. For n = 123 the bytes are 0,1,2,3 and exactly one byte is dropped.
  // 0 minus 8 to make MSVC happy.
  uint32_t zeroes = static_cast<uint32_t>(absl::countr_zero(tens)) & (0 - 8ull);
  tens += kFourZeroBytes;
  tens >>= zeroes;
  // Store least-significant byte first explicitly. Copying the object
  // representation with memcpy only produces this order on little-endian
  // hosts; on big-endian hosts the digits would come out reversed. Compilers
  // merge these stores into a single 32-bit store where that is legal.
  out_str[0] = static_cast<char>(tens & 0xFFu);
  out_str[1] = static_cast<char>((tens >> 8) & 0xFFu);
  out_str[2] = static_cast<char>((tens >> 16) & 0xFFu);
  out_str[3] = static_cast<char>((tens >> 24) & 0xFFu);
  return out_str + sizeof(tens) - zeroes / 8;
}

char* numbers_internal::FastIntToBuffer(uint32_t i, char* buffer) {
uint32_t digits;
// The idea of this implementation is to trim the number of divides to as few
// as possible, and also reducing memory stores and branches, by going in
// steps of two digits at a time rather than one whenever possible.
// The huge-number case is first, in the hopes that the compiler will output
// that case in one branch-free block of code, and only output conditional
// branches into it from below.
if (i >= 1000000000) { // >= 1,000,000,000
digits = i / 100000000; // 100,000,000
i -= digits * 100000000;
PutTwoDigits(digits, buffer);
buffer += 2;
lt100_000_000:
digits = i / 1000000; // 1,000,000
i -= digits * 1000000;
PutTwoDigits(digits, buffer);
buffer += 2;
lt1_000_000:
digits = i / 10000; // 10,000
i -= digits * 10000;
PutTwoDigits(digits, buffer);
buffer += 2;
lt10_000:
digits = i / 100;
i -= digits * 100;
PutTwoDigits(digits, buffer);
buffer += 2;
lt100:
digits = i;
PutTwoDigits(digits, buffer);
buffer += 2;
*buffer = 0;
return buffer;
}
// Prepare functions return an integer whose bytes are the digits that should
// be written to out_str; leading zero digits are still present and the bytes
// are not yet converted to ASCII.

// For hi < 10000 and lo < 10000, returns the eight decimal digits of
// hi * 10000 + lo packed one per byte as values 0..9, with the most
// significant digit in the least significant byte. The caller adds
// kEightZeroBytes to turn the bytes into ASCII characters.
inline uint64_t PrepareTenThousands(uint64_t hi, uint64_t lo) {
  // One 7-bit quotient (0..99) in each 32-bit half.
  constexpr uint64_t kQuotient100Mask = (0x7Full << 32) | 0x7Full;
  // One 4-bit quotient (0..9) in each 16-bit lane.
  constexpr uint64_t kQuotient10Mask =
      (0xFull << 48) | (0xFull << 32) | (0xFull << 16) | 0xFull;

  // hi goes into the low 32 bits, lo into the high 32 bits; both halves are
  // then divided by 100 at once.
  const uint64_t packed = (lo << 32) | hi;
  const uint64_t quot100 =
      ((packed * kDivisionBy100Mul) / kDivisionBy100Div) & kQuotient100Mask;
  const uint64_t rem100 = packed - 100ull * quot100;

  // Four 16-bit lanes, from low to high: hi / 100, hi % 100, lo / 100,
  // lo % 100. All four are divided by 10 at once.
  const uint64_t pairs = quot100 + (rem100 << 16);
  const uint64_t quot10 =
      ((pairs * kDivisionBy10Mul) / kDivisionBy10Div) & kQuotient10Mask;
  const uint64_t rem10 = pairs - 10ull * quot10;

  // Within each lane: low byte = tens digit, high byte = ones digit.
  return quot10 + (rem10 << 8);
}

if (i < 100) {
digits = i;
if (i >= 10) goto lt100;
memcpy(buffer, one_ASCII_final_digits[i], 2);
return buffer + 1;
}
if (i < 10000) { // 10,000
if (i >= 1000) goto lt10_000;
digits = i / 100;
i -= digits * 100;
*buffer++ = '0' + static_cast<char>(digits);
goto lt100;
// Stores the eight bytes of `v` to `out`, least significant byte first,
// independent of host byte order. Compilers merge the byte stores into a
// single 64-bit store where that is legal.
inline void StoreLittleEndian64(uint64_t v, char* out) {
  for (int k = 0; k < 8; ++k) {
    out[k] = static_cast<char>((v >> (8 * k)) & 0xFFu);
  }
}

// Writes the decimal digits of `n` (requires n != 0) to `out_str` without
// leading zeros and without a terminating NUL. Returns a pointer just past the
// last digit written. Eight bytes are always stored for the low eight digits,
// so `out_str` must have room for the longest output (ten chars) regardless
// of how many digits `n` has.
//
// `n` must not be 0: in the short path all digit bytes would be zero, and the
// trailing-zero count would then equal the full word width, which makes the
// shift undefined. ABSL_ASSUME records that precondition for the optimizer.
inline char* EncodeFullU32(uint32_t n, char* out_str) {
  if (n < 100'000'000) {
    // At most eight digits: build all eight, then strip leading zero digits,
    // which are the low-order zero bytes of `bottom`.
    uint64_t bottom = PrepareTenThousands(n / 10000, n % 10000);
    ABSL_ASSUME(bottom != 0);
    // Round the zero-bit count down to whole bytes (`& -8`).
    // 0 minus 8 to make MSVC happy.
    uint32_t zeroes =
        static_cast<uint32_t>(absl::countr_zero(bottom)) & (0 - 8ull);
    uint64_t bottom_res = bottom + kEightZeroBytes;
    bottom_res >>= zeroes;
    StoreLittleEndian64(bottom_res, out_str);
    return out_str + sizeof(bottom) - zeroes / 8;
  }
  // Nine or ten digits: one or two leading digits followed by exactly eight
  // digits, so no zero stripping is needed for the low part.
  uint32_t top = n / 100'000'000;
  n %= 100'000'000;
  uint64_t bottom = PrepareTenThousands(n / 10000, n % 10000);
  uint64_t bottom_res = bottom + kEightZeroBytes;
  out_str = EncodeHundred(top, out_str);
  StoreLittleEndian64(bottom_res, out_str);
  return out_str + sizeof(bottom);
}

} // namespace

// Writes the decimal representation of `n` followed by a terminating NUL to
// `out_str` and returns a pointer to that NUL (one past the last digit).
//
// The encode helpers store 2, 4 or 8 bytes at a time, so bytes beyond the
// returned terminator may be overwritten; `out_str` must be large enough for
// the longest output (ten digits plus the NUL).
char* numbers_internal::FastIntToBuffer(uint32_t n, char* out_str) {
  // Dispatch on magnitude so small numbers take the cheapest encoder. The two
  // larger encoders require a non-zero argument, which the range checks
  // guarantee; n == 0 is handled by EncodeHundred.
  if (n < 100) {
    out_str = EncodeHundred(n, out_str);
  } else if (n < 10000) {
    out_str = EncodeTenThousand(n, out_str);
  } else {
    out_str = EncodeFullU32(n, out_str);
  }
  *out_str = '\0';
  return out_str;
}

char* numbers_internal::FastIntToBuffer(int32_t i, char* buffer) {
Expand All @@ -230,41 +272,40 @@ char* numbers_internal::FastIntToBuffer(uint64_t i, char* buffer) {
uint32_t u32 = static_cast<uint32_t>(i);
if (u32 == i) return numbers_internal::FastIntToBuffer(u32, buffer);

// Here we know i has at least 10 decimal digits.
uint64_t top_1to11 = i / 1000000000;
u32 = static_cast<uint32_t>(i - top_1to11 * 1000000000);
uint32_t top_1to11_32 = static_cast<uint32_t>(top_1to11);
// 10**9 < 2**32 <= i < 10**10, we can do 2+8
uint64_t div08 = i / 100'000'000ull;
uint64_t mod08 = i % 100'000'000ull;
uint64_t mod_result =
PrepareTenThousands(mod08 / 10000, mod08 % 10000) + kEightZeroBytes;
if (i < 10'000'000'000ull) {
buffer = EncodeHundred(static_cast<uint32_t>(div08), buffer);
memcpy(buffer, &mod_result, 8);
buffer += 8;
goto set_last_zero;
}

if (top_1to11_32 == top_1to11) {
buffer = numbers_internal::FastIntToBuffer(top_1to11_32, buffer);
// i < 10**16, in this case 8+8
if (i < 10'000'000'000'000'000ull) {
buffer = EncodeFullU32(static_cast<uint32_t>(div08), buffer);
memcpy(buffer, &mod_result, 8);
buffer += 8;
goto set_last_zero;
} else {
// top_1to11 has more than 32 bits too; print it in two steps.
uint32_t top_8to9 = static_cast<uint32_t>(top_1to11 / 100);
uint32_t mid_2 = static_cast<uint32_t>(top_1to11 - top_8to9 * 100);
buffer = numbers_internal::FastIntToBuffer(top_8to9, buffer);
PutTwoDigits(mid_2, buffer);
buffer += 2;
// 4 + 8 + 8
uint64_t div016 = i / 10'000'000'000'000'000ull;
buffer = EncodeTenThousand(static_cast<uint32_t>(div016), buffer);
uint64_t mid_result = div08 - div016 * 100'000'000ull;
mid_result = PrepareTenThousands(mid_result / 10000, mid_result % 10000) +
kEightZeroBytes;
memcpy(buffer, &mid_result, 8);
buffer += 8;
memcpy(buffer, &mod_result, 8);
buffer += 8;
goto set_last_zero;
}

// We have only 9 digits now, again the maximum uint32_t can handle fully.
uint32_t digits = u32 / 10000000; // 10,000,000
u32 -= digits * 10000000;
PutTwoDigits(digits, buffer);
buffer += 2;
digits = u32 / 100000; // 100,000
u32 -= digits * 100000;
PutTwoDigits(digits, buffer);
buffer += 2;
digits = u32 / 1000; // 1,000
u32 -= digits * 1000;
PutTwoDigits(digits, buffer);
buffer += 2;
digits = u32 / 10;
u32 -= digits * 10;
PutTwoDigits(digits, buffer);
buffer += 2;
memcpy(buffer, one_ASCII_final_digits[u32], 2);
return buffer + 1;
set_last_zero:
*buffer = '\0';
return buffer;
}

char* numbers_internal::FastIntToBuffer(int64_t i, char* buffer) {
Expand Down

0 comments on commit 15d26cd

Please sign in to comment.