-
Notifications
You must be signed in to change notification settings - Fork 2.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Speeding up write_significand() #2499
Conversation
4c90efb
to
ef667e7
Compare
ef667e7
to
4a54487
Compare
Could you post here the benchmark code that you used to obtained the timings? |
https://ideone.com/lTcMzk Here are tests of several options |
Execution result on my machine |
Copying here for future reference: #include <array>
#include <memory>
#include <cstdio>
#include <vector>
#include <random>
#include <chrono>
#include <cstring>
#define FMT_CONSTEXPR constexpr
template <typename T> constexpr auto make_checked(T* p, size_t) -> T* {
return p;
}
// Converts value in the range [0, 100) to a string.
constexpr const char* digits2(size_t value) {
// GCC generates slightly better code when value is pointer-size.
return &"0001020304050607080910111213141516171819"
"2021222324252627282930313233343536373839"
"4041424344454647484950515253545556575859"
"6061626364656667686970717273747576777879"
"8081828384858687888990919293949596979899"[value * 2];
}
// Casts a nonnegative integer to unsigned.
template <typename Int>
FMT_CONSTEXPR auto to_unsigned(Int value) ->
typename std::make_unsigned<Int>::type {
//FMT_ASSERT(value >= 0, "negative value");
return static_cast<typename std::make_unsigned<Int>::type>(value);
}
// Copies two characters from src to dst.
template <typename Char>
void copy2(Char* dst, const char* src) {
if (/*!is_constant_evaluated() && */sizeof(Char) == sizeof(char)) {
memcpy(dst, src, 2);
return;
}
*dst++ = static_cast<Char>(*src++);
*dst = static_cast<Char>(*src);
}
template <typename Iterator> struct format_decimal_result {
Iterator begin;
Iterator end;
};
// Formats a decimal unsigned integer value writing into out pointing to a
// buffer of specified size. The caller must ensure that the buffer is large
// enough.
template <typename Char, typename UInt>
FMT_CONSTEXPR auto format_decimal(Char* out, UInt value, int size)
-> format_decimal_result<Char*> {
//FMT_ASSERT(size >= count_digits(value), "invalid digit count");
out += size;
Char* end = out;
while (value >= 100) {
// Integer division is slow so do it for a group of two digits instead
// of for every digit. The idea comes from the talk by Alexandrescu
// "Three Optimization Tips for C++". See speed-test for a comparison.
out -= 2;
copy2(out, digits2(static_cast<size_t>(value % 100)));
value /= 100;
}
if (value < 10) {
*--out = static_cast<Char>('0' + value);
return {out, end};
}
out -= 2;
copy2(out, digits2(static_cast<size_t>(value)));
return {out, end};
}
template <typename Char, typename UInt/*,
FMT_ENABLE_IF(std::is_integral<UInt>::value)*/>
inline auto write_significand(Char* out, UInt significand, int significand_size,
int integral_size, Char decimal_point) -> Char* {
if (!decimal_point)
return format_decimal(out, significand, significand_size).end;
auto end = format_decimal(out + 1, significand, significand_size).end;
if (integral_size == 1) {
out[0] = out[1];
} else {
std::uninitialized_copy_n(out + 1, integral_size,
make_checked(out, to_unsigned(integral_size)));
}
out[integral_size] = decimal_point;
return end;
}
template <typename Char, typename UInt/*,
FMT_ENABLE_IF(std::is_integral<UInt>::value)*/>
inline auto write_significand2(Char* out, UInt significand, int significand_size,
int integral_size, Char decimal_point) -> Char* {
Char* ccc = out;
if (!decimal_point)
return format_decimal(out, significand, significand_size).end;
out += significand_size + 1;
Char* end = out;
int floating_size = significand_size - integral_size;
for(int i = floating_size / 2; i > 0; --i) {
out -= 2;
copy2(out, digits2(significand % 100));
significand /= 100;
}
if (floating_size % 2 != 0) {
const char* tmp = digits2(significand % 100);
significand /= 100;
*out-- = static_cast<Char>(*tmp++);
*out-- = decimal_point;
*out = static_cast<Char>(*tmp);
--integral_size;
} else {
*--out = decimal_point;
}
if (integral_size > 0)
format_decimal(out - integral_size, significand, integral_size);
return end;
}
template <typename Char, typename UInt/*,
FMT_ENABLE_IF(std::is_integral<UInt>::value)*/>
inline auto write_significand3(Char* out, UInt significand, int significand_size,
int integral_size, Char decimal_point) -> Char* {
if (!decimal_point)
return format_decimal(out, significand, significand_size).end;
out += significand_size + 1;
Char* end = out;
int floating_size = significand_size - integral_size;
for(int i = (floating_size + 1) / 2; i > 0; --i) {
out -= 2;
copy2(out, digits2(significand % 100));
significand /= 100;
}
if (floating_size % 2 != 0) {
Char tmp = *out;
*out = decimal_point;
*--out = tmp;
--integral_size;
} else {
*--out = decimal_point;
}
if (integral_size > 0)
format_decimal(out - integral_size, significand, integral_size);
return end;
}
template <typename Char, typename UInt/*,
FMT_ENABLE_IF(std::is_integral<UInt>::value)*/>
inline auto write_significand4(Char* out, UInt significand, int significand_size,
int integral_size, Char decimal_point) -> Char* {
if (!decimal_point)
return format_decimal(out, significand, significand_size).end;
out += significand_size + 1;
Char* end = out;
int floating_size = significand_size - integral_size;
for(int i = floating_size / 2; i > 0; --i) {
out -= 2;
copy2(out, digits2(significand % 100));
significand /= 100;
}
if (floating_size % 2 != 0) {
*--out = static_cast<Char>('0' + significand % 10);
significand /= 10;
}
*--out = decimal_point;
format_decimal(out - integral_size, significand, integral_size);
return end;
}
template <typename Char, typename UInt/*,
FMT_ENABLE_IF(std::is_integral<UInt>::value)*/>
inline auto write_significand5(Char* out, UInt significand, int significand_size,
int integral_size, Char decimal_point) -> Char* {
if (!decimal_point)
return format_decimal(out, significand, significand_size).end;
auto end = format_decimal(out + 1, significand, significand_size).end;
Char tmp = decimal_point;
for(Char* i = out + integral_size; i != out; --i) {
std::swap(tmp, *i);
}
*out = tmp;
/*for (int i=0; i< integral_size; ++i) {
out[i] = out[i + 1];
}*/
out[integral_size] = decimal_point;
return end;
}
std::vector<unsigned> genTestVector(int integral_size, int floating_size) {
std::uniform_int_distribution<unsigned> distrib1(
std::pow(10, integral_size - 1), std::pow(10, integral_size) - 1);
std::uniform_int_distribution<unsigned> distrib2(
std::pow(10, floating_size - 1), std::pow(10, floating_size) - 1);
std::random_device rd;
std::mt19937 gen(rd());
std::vector<unsigned> res;
const unsigned tmp = std::pow(10, floating_size);
for (int i = 0; i < 10000; ++i) {
res.push_back(distrib1(gen) * tmp + distrib2(gen));
}
return res;
}
template <class F>
std::chrono::steady_clock::duration test(const std::vector<unsigned> &vec,
int significand_size,
int integral_size, F &&f)
{
char buf[32];
auto start = std::chrono::steady_clock::now();
for (int i = 0; i < 1000; ++i) {
for (unsigned tmp : vec) {
volatile char *x = f(buf, tmp, significand_size, integral_size);
}
}
auto end = std::chrono::steady_clock::now();
return end - start;
}
int main() {
printf("is, fs) speed\n");
for (int integral_size = 1; integral_size <= 4; ++integral_size) {
for (int floating_size = 1; floating_size <= 4; ++floating_size) {
auto vec = genTestVector(integral_size, floating_size);
int significand_size = integral_size + floating_size;
std::array<std::chrono::duration<double, std::milli>, 5> time;
time[0] = test(
vec, significand_size, integral_size,
[](char *buf, unsigned val, int significand_size, int integral_size) {
return write_significand(buf, val, significand_size, integral_size,
'.');
});
time[1] = test(
vec, significand_size, integral_size,
[](char *buf, unsigned val, int significand_size, int integral_size) {
return write_significand2(buf, val, significand_size, integral_size,
'.');
});
time[2] = test(
vec, significand_size, integral_size,
[](char *buf, unsigned val, int significand_size, int integral_size) {
return write_significand3(buf, val, significand_size, integral_size,
'.');
});
time[3] = test(
vec, significand_size, integral_size,
[](char *buf, unsigned val, int significand_size, int integral_size) {
return write_significand4(buf, val, significand_size, integral_size,
'.');
});
time[4] = test(
vec, significand_size, integral_size,
[](char *buf, unsigned val, int significand_size, int integral_size) {
return write_significand5(buf, val, significand_size, integral_size,
'.');
});
printf("%.2d, %.2d) %.3lf %.3lf %.3lf %.3lf\n", integral_size, floating_size,
time[0] / time[1], time[0] / time[2], time[0] / time[3],
time[0] / time[4]);
}
}
} |
Thanks for the benchmark, it looks like you are using I tested it on clang with
Is it possible to avoid the regression? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should go with write_significand2
which has smaller regression for the single-digit integral case.
Optimization can be added, but is it needed? Why is the single-valued whole part a frequent occurrence?
|
Single-digit integral part is very common because it's used in the exponential, general and default formats except for small(-ish) exponent. We shouldn't regress it too much. |
Merged, thanks. The single-digit integral part can be optimized later. |
The new version gives an increase of about 2 times, except for the case when the size of the integer part is 1 (since in the original code this case is specially optimized)
Tests
integral_size, floating_size) Speed
1, 1) 0.967493
1, 2) 1.366119
1, 3) 1.016240
1, 4) 1.194619
2, 1) 2.453043
2, 2) 2.560216
2, 3) 2.554420
2, 4) 1.944176
3, 1) 2.198698
3, 2) 2.430826
3, 3) 2.032839
3, 4) 2.088472
4, 1) 2.366449
4, 2) 2.460539
4, 3) 2.084301
4, 4) 2.077537