Skip to content

Commit 8c401c8

Browse files
authored
Constant division macros (ridiculousfish#76)
* Add division by compile time constant macros * Add tests for constant division * Multiple compile fixes for various platforms. * Add constant tests
1 parent ef645e2 commit 8c401c8

18 files changed

+132103
-33
lines changed

CMakeLists.txt

+4
Original file line numberDiff line numberDiff line change
@@ -256,23 +256,27 @@ if (BUILD_TESTS)
256256

257257
add_executable(tester test/tester.cpp)
258258
add_executable(test_c99 test/test_c99.c)
259+
add_executable(fast_div_generator test/fast_div_generator.cpp)
259260
add_executable(benchmark test/benchmark.cpp)
260261
add_executable(benchmark_branchfree test/benchmark_branchfree.cpp)
261262

262263
target_link_libraries(tester libdivide Threads::Threads)
263264
target_link_libraries(test_c99 libdivide)
265+
target_link_libraries(fast_div_generator libdivide)
264266
target_link_libraries(benchmark libdivide)
265267
target_link_libraries(benchmark_branchfree libdivide)
266268

267269
target_compile_options(tester PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE}")
268270
target_compile_options(test_c99 PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE}")
271+
target_compile_options(fast_div_generator PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE}")
269272
target_compile_options(benchmark PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE_C}")
270273
target_compile_options(benchmark_branchfree PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE}")
271274
set_property(TARGET benchmark_branchfree PROPERTY CXX_STANDARD 11)
272275
set_property(TARGET test_c99 PROPERTY C_STANDARD 99)
273276

274277
target_compile_definitions(tester PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
275278
target_compile_definitions(test_c99 PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
279+
target_compile_definitions(fast_div_generator PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
276280
target_compile_definitions(benchmark PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
277281
target_compile_definitions(benchmark_branchfree PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
278282
endif()

README.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ vector division which provides an even larger speedup. You can test how much
1919
speedup you can achieve on your CPU using the [benchmark](#benchmark-program)
2020
program.
2121

22-
libdivide is compatible with 8-bit microcontrollers, such as the AVR series: [the CI build includes a AtMega2560 target](test/avr/readme.md). Since low end hardware such as this often do not include a hardware divider, libdivide is particulary useful.
22+
libdivide is compatible with 8-bit microcontrollers, such as the AVR series: [the CI build includes an AtMega2560 target](test/avr/readme.md). Since low-end hardware such as this often does not include a hardware divider, libdivide is particularly useful. In addition to the runtime [C](https://github.com/ridiculousfish/libdivide/blob/master/doc/C-API.md) & [C++](https://github.com/ridiculousfish/libdivide/blob/master/doc/CPP-API.md) APIs, a set of [predefined macros](constant_fast_div.h) is included to speed up division by 16-bit constants: division by a 16-bit constant is [not optimized by avr-gcc on 8-bit systems](https://stackoverflow.com/questions/47994933/why-doesnt-gcc-or-clang-on-arm-use-division-by-invariant-integers-using-multip).
2323

2424
See https://libdivide.com for more information on libdivide.
2525

@@ -83,6 +83,7 @@ void divide(int64_t *array, size_t size, int64_t divisor)
8383
8484
* [C API](https://github.com/ridiculousfish/libdivide/blob/master/doc/C-API.md)
8585
* [C++ API](https://github.com/ridiculousfish/libdivide/blob/master/doc/CPP-API.md)
86+
* [Invariant Division](constant_fast_div.h)
8687
8788
# Branchfull vs branchfree
8889

constant_fast_div.h

+107
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
/*
2+
* When dividing by a known compile time constant, the division can be replaced
3+
* by a multiply+shift operation. GCC will do this automatically,
4+
* *BUT ONLY FOR DIVISION OF REGISTER-WIDTH OR NARROWER*.
5+
*
6+
* So on an 8-bit system, 16-bit divides will *NOT* be optimised.
7+
*
8+
* The macros here manually apply the multiply+shift operation for 16-bit numbers.
9+
*
10+
* Testing on an AtMega2560, -O3 optimizations:
11+
* Speed-up of 85% to 90%+ (division by non-powers of 2)
12+
* Zero increase in RAM usage
13+
* Average of 25 bytes Flash used per call site
14+
* Be careful calling this in a loop with aggressive loop unrolling!
15+
*
16+
* Note: testing of the multiply+shift technique on 8-bit division showed a
17+
* slight slow down over native code on AtMega2560. So the 8 bit equivalent
18+
* macros have not been included
19+
*/
20+
21+
#pragma once
22+
#include "libdivide.h"
23+
#include "u16_ldparams.h"
24+
#include "s16_ldparams.h"
25+
26+
// Pastes two tokens together exactly as written (operands of ## are not macro-expanded first).
#define CAT_HELPER(lhs, rhs) lhs ## rhs
27+
// Two-level paste: routing through CAT_HELPER lets macro arguments expand before they are joined.
#define CONCAT(LHS, RHS) CAT_HELPER(LHS, RHS)
28+
29+
// GCC will optimise division by a power of 2
30+
// So allow that.
31+
#define S16_ISPOW2_NEG(denom) \
32+
(denom==-2 || \
33+
denom==-4 || \
34+
denom==-8 || \
35+
denom==-16 || \
36+
denom==-32 || \
37+
denom==-64 || \
38+
denom==-128 || \
39+
denom==-256 || \
40+
denom==-512 || \
41+
denom==-1024 || \
42+
denom==-2048 || \
43+
denom==-4096 || \
44+
denom==-8192 || \
45+
denom==-16384)
46+
#define S16_ISPOW2_POS(denom) \
47+
(denom==2 || \
48+
denom==4 || \
49+
denom==8 || \
50+
denom==16 || \
51+
denom==32 || \
52+
denom==64 || \
53+
denom==128 || \
54+
denom==256 || \
55+
denom==512 || \
56+
denom==1024 || \
57+
denom==2048 || \
58+
denom==4096 || \
59+
denom==8192 || \
60+
denom==16384)
61+
// True when denom is one of the positive powers of two accepted by S16_ISPOW2_POS, or 32768.
// denom is parenthesised so that an expression argument cannot re-associate around '=='.
#define U16_ISPOW2(denom) (S16_ISPOW2_POS(denom) || (denom) == 32768)
62+
// True when value matches either the positive or the negative power-of-two list.
#define S16_ISPOW2(value) (S16_ISPOW2_POS(value) || S16_ISPOW2_NEG(value))
63+
64+
// Apply the libdivide namespace if necessary
65+
#ifdef __cplusplus
66+
#define LIB_DIV_NAMESPACE libdivide::
67+
#else
68+
#define LIB_DIV_NAMESPACE
69+
#endif
70+
71+
/*
72+
* Wrapper for *unsigned* 16-bit DIVISION. The divisor must be a compile time
73+
* constant.
74+
* E.g. FAST_DIV16U(value, 100)
75+
*/
76+
// Builds the identifier U16LD_DENOM_<denom>_MAGIC (presumably provided by u16_ldparams.h).
// denom must be a plain decimal literal because it is pasted into the name.
#define U16_MAGIC(denom) CONCAT(CONCAT(U16LD_DENOM_, denom), _MAGIC)
77+
// Builds the identifier U16LD_DENOM_<denom>_MORE (presumably provided by u16_ldparams.h).
// denom must be a plain decimal literal because it is pasted into the name.
#define U16_MORE(denom) CONCAT(CONCAT(U16LD_DENOM_, denom), _MORE)
78+
// Unsigned 16-bit a / d for a literal constant d. Power-of-two divisors use a plain '/';
// every other divisor calls libdivide_u16_do_raw with the constants named after d.
// The operands of the plain division are parenthesised so that an expression argument,
// e.g. FAST_DIV16U(x + 1, 4), divides the whole expression rather than its last term.
#define FAST_DIV16U(a, d) (U16_ISPOW2(d) ? (a) / (d) : LIB_DIV_NAMESPACE libdivide_u16_do_raw((a), U16_MAGIC(d), U16_MORE(d)))
79+
80+
/*
81+
* Wrapper for *signed* 16-bit DIVISION by a *POSITIVE* compile time constant.
82+
* E.g. FAST_DIV16(-value, 777)
83+
*
84+
* This only works for positive parameters :-(
85+
* A negative number results in a hyphen in the macro name, which is not allowed
86+
*/
87+
// Builds the identifier S16LD_DENOM_<denom>_MAGIC (presumably provided by s16_ldparams.h).
// denom must be a plain decimal literal because it is pasted into the name.
#define S16_MAGIC(denom) CONCAT(CONCAT(S16LD_DENOM_, denom), _MAGIC)
88+
// Builds the identifier S16LD_DENOM_<denom>_MORE (presumably provided by s16_ldparams.h).
// denom must be a plain decimal literal because it is pasted into the name.
#define S16_MORE(denom) CONCAT(CONCAT(S16LD_DENOM_, denom), _MORE)
89+
// Signed 16-bit a / d for a positive literal constant d. Power-of-two divisors use a plain '/';
// every other divisor calls libdivide_s16_do_raw with the constants named after d.
// The operands of the plain division are parenthesised so that an expression argument,
// e.g. FAST_DIV16(x + 1, 4), divides the whole expression rather than its last term.
#define FAST_DIV16(a, d) (S16_ISPOW2(d) ? (a) / (d) : LIB_DIV_NAMESPACE libdivide_s16_do_raw((a), S16_MAGIC(d), S16_MORE(d)))
90+
91+
/*
92+
* Wrapper for *signed* 16-bit DIVISION by a *NEGATIVE* compile time constant.
93+
* E.g. FAST_DIV16_NEG(-value, 777) // <-- divides by -777: the macro negates the divisor itself.
94+
*
95+
* This only works for positive parameters :-(
96+
* A negative number results in a hyphen in the macro name, which is not allowed
97+
*/
98+
// Builds the identifier S16LD_DENOM_MINUS_<denom>_MAGIC (presumably provided by s16_ldparams.h).
// denom is the positive magnitude, written as a plain decimal literal.
#define S16_MAGIC_NEG(denom) CONCAT(CONCAT(S16LD_DENOM_MINUS_, denom), _MAGIC)
99+
// Builds the identifier S16LD_DENOM_MINUS_<denom>_MORE (presumably provided by s16_ldparams.h).
// denom is the positive magnitude, written as a plain decimal literal.
#define S16_MORE_NEG(denom) CONCAT(CONCAT(S16LD_DENOM_MINUS_, denom), _MORE)
100+
// Signed 16-bit a / -d, where d is the positive magnitude of the negative constant divisor.
// Power-of-two magnitudes use a plain '/'; every other divisor calls libdivide_s16_do_raw
// with the S16LD_DENOM_MINUS_<d> constants.
// The operands of the plain division are parenthesised so that an expression argument,
// e.g. FAST_DIV16_NEG(x + 1, 4), divides the whole expression rather than its last term.
#define FAST_DIV16_NEG(a, d) (S16_ISPOW2(d) ? (a) / -(d) : LIB_DIV_NAMESPACE libdivide_s16_do_raw((a), S16_MAGIC_NEG(d), S16_MORE_NEG(d)))
101+
102+
/*
103+
* Wrapper for *unsigned* 16-bit MODULUS. The divisor must be a compile time
104+
* constant.
105+
* E.g. FAST_MOD16U(value, 6)
106+
*/
107+
// Unsigned 16-bit a % d for a literal constant d, computed as a - (a / d) * d via FAST_DIV16U.
// a and d are parenthesised so that an expression argument, e.g. FAST_MOD16U(x | 1, 4),
// is treated as a single operand. Note that a appears twice in the expansion, so it is
// evaluated twice: avoid arguments with side effects.
#define FAST_MOD16U(a, d) ((a) - (FAST_DIV16U(a, d) * (d)))

libdivide.h

+25-9
Original file line numberDiff line numberDiff line change
@@ -229,8 +229,12 @@ static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfr
229229
static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d);
230230
static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);
231231

232+
static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
233+
int16_t numer, int16_t magic, uint8_t more);
232234
static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
233235
int16_t numer, const struct libdivide_s16_t* denom);
236+
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
237+
uint16_t numer, uint16_t magic, uint8_t more);
234238
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
235239
uint16_t numer, const struct libdivide_u16_t* denom);
236240
static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
@@ -736,13 +740,15 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
736740
return ret;
737741
}
738742

739-
uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) {
740-
uint8_t more = denom->more;
741-
if (!denom->magic) {
743+
// The original libdivide_u16_do takes a const pointer. However, this cannot be used
744+
// with a compile time constant libdivide_u16_t: it will generate a warning about
745+
// taking the address of a temporary. Hence this _raw variant, which takes magic and more by value.
746+
uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
747+
if (!magic) {
742748
return numer >> more;
743749
}
744750
else {
745-
uint16_t q = libdivide_mullhi_u16(denom->magic, numer);
751+
uint16_t q = libdivide_mullhi_u16(magic, numer);
746752
if (more & LIBDIVIDE_ADD_MARKER) {
747753
uint16_t t = ((numer - q) >> 1) + q;
748754
return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
@@ -752,7 +758,11 @@ uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) {
752758
// don't need to mask them off.
753759
return q >> more;
754760
}
755-
}
761+
}
762+
}
763+
764+
uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) {
765+
return libdivide_u16_do_raw(numer, denom->magic, denom->more);
756766
}
757767

758768
uint16_t libdivide_u16_branchfree_do(
@@ -1237,11 +1247,13 @@ struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) {
12371247
return result;
12381248
}
12391249

1240-
int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
1241-
uint8_t more = denom->more;
1250+
// The original libdivide_s16_do takes a const pointer. However, this cannot be used
1251+
// with a compile time constant libdivide_s16_t: it will generate a warning about
1252+
// taking the address of a temporary. Hence this _raw variant, which takes magic and more by value.
1253+
int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) {
12421254
uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
12431255

1244-
if (!denom->magic) {
1256+
if (!magic) {
12451257
uint16_t sign = (int8_t)more >> 7;
12461258
uint16_t mask = ((uint16_t)1 << shift) - 1;
12471259
uint16_t uq = numer + ((numer >> 15) & mask);
@@ -1250,7 +1262,7 @@ int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
12501262
q = (q ^ sign) - sign;
12511263
return q;
12521264
} else {
1253-
uint16_t uq = (uint16_t)libdivide_mullhi_s16(denom->magic, numer);
1265+
uint16_t uq = (uint16_t)libdivide_mullhi_s16(magic, numer);
12541266
if (more & LIBDIVIDE_ADD_MARKER) {
12551267
// must be arithmetic shift and then sign extend
12561268
int16_t sign = (int8_t)more >> 7;
@@ -1265,6 +1277,10 @@ int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
12651277
}
12661278
}
12671279

1280+
int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
1281+
return libdivide_s16_do_raw(numer, denom->magic, denom->more);
1282+
}
1283+
12681284
int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) {
12691285
uint8_t more = denom->more;
12701286
uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;

0 commit comments

Comments
 (0)