adbancroft
diff --git a/‎CMakeLists.txt
+4 b/‎CMakeLists.txt
+4
diff --git a/‎README.md
+2-1 b/‎README.md
+2-1
diff --git a/‎constant_fast_div.h
+107 b/‎constant_fast_div.h
+107
diff --git a/‎libdivide.h
+25-9 b/‎libdivide.h
+25-9
@@ -256,23 +256,27 @@ if (BUILD_TESTS)
 
     add_executable(tester test/tester.cpp)
     add_executable(test_c99 test/test_c99.c)
+    add_executable(fast_div_generator test/fast_div_generator.cpp)
     add_executable(benchmark test/benchmark.cpp)
     add_executable(benchmark_branchfree test/benchmark_branchfree.cpp)
 
     target_link_libraries(tester libdivide Threads::Threads)
     target_link_libraries(test_c99 libdivide)
+    target_link_libraries(fast_div_generator libdivide)
     target_link_libraries(benchmark libdivide)
     target_link_libraries(benchmark_branchfree libdivide)
 
     target_compile_options(tester PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE}")
     target_compile_options(test_c99 PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE}")
+    target_compile_options(fast_div_generator PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE}")
     target_compile_options(benchmark PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE_C}")
     target_compile_options(benchmark_branchfree PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE}")
     set_property(TARGET benchmark_branchfree PROPERTY CXX_STANDARD 11)
     set_property(TARGET test_c99 PROPERTY C_STANDARD 99)
 
     target_compile_definitions(tester PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
     target_compile_definitions(test_c99 PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
+    target_compile_definitions(fast_div_generator PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
     target_compile_definitions(benchmark PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
     target_compile_definitions(benchmark_branchfree PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
 endif()
 
@@ -19,7 +19,7 @@ vector division which provides an even larger speedup. You can test how much
 speedup you can achieve on your CPU using the [benchmark](#benchmark-program)
 program.
 
-libdivide is compatible with 8-bit microcontrollers, such as the AVR series: [the CI build includes a AtMega2560 target](test/avr/readme.md). Since low end hardware such as this often do not include a hardware divider, libdivide is particulary useful.
+libdivide is compatible with 8-bit microcontrollers, such as the AVR series: [the CI build includes an AtMega2560 target](test/avr/readme.md). Since low-end hardware such as this often does not include a hardware divider, libdivide is particularly useful. In addition to the runtime [C](https://github.com/ridiculousfish/libdivide/blob/master/doc/C-API.md) & [C++](https://github.com/ridiculousfish/libdivide/blob/master/doc/CPP-API.md) APIs, a set of [predefined macros](constant_fast_div.h) is included to speed up division by 16-bit constants: division by a 16-bit constant is [not optimized by avr-gcc on 8-bit systems](https://stackoverflow.com/questions/47994933/why-doesnt-gcc-or-clang-on-arm-use-division-by-invariant-integers-using-multip).
 
 See https://libdivide.com for more information on libdivide.
 
@@ -83,6 +83,7 @@ void divide(int64_t *array, size_t size, int64_t divisor)
 
 * [C API](https://github.com/ridiculousfish/libdivide/blob/master/doc/C-API.md)
 * [C++ API](https://github.com/ridiculousfish/libdivide/blob/master/doc/CPP-API.md)
+* [Invariant Division](constant_fast_div.h)
 
 # Branchfull vs branchfree
 
 
@@ -0,0 +1,107 @@
+/*
+* When dividing by a known compile time constant, the division can be replaced
+* by a multiply+shift operation. GCC will do this automatically, 
+* *BUT ONLY FOR DIVISION OF REGISTER-WIDTH OR NARROWER*.
+*
+* So on an 8-bit system, 16-bit divides will *NOT* be optimised.
+*
+* The macros here manually apply the multiply+shift operation for 16-bit numbers.
+*
+* Testing on an AtMega2560, -O3 optimizations:
+*   Performance improvement of 85% to 90%+ speed up (division by non-powers of 2)
+*   Zero increase in RAM usage
+*   Average of 25 bytes Flash used per call site
+*     Be careful calling this in a loop with aggressive loop unrolling!
+*  
+* Note: testing of the multiply+shift technique on 8-bit division showed a
+* slight slowdown compared to native code on AtMega2560, so the 8-bit
+* equivalent macros have not been included.
+*/
+
+#pragma once
+#include "libdivide.h"
+#include "u16_ldparams.h"
+#include "s16_ldparams.h"
+
+#define CAT_HELPER(lhs, rhs) lhs##rhs
+#define CONCAT(PREFIX, SUFFIX) CAT_HELPER(PREFIX, SUFFIX)
+
+// GCC already optimises division by a power of 2, so the FAST_DIV* macros use
+// these tests to fall back to the native operator (argument is parenthesised).
+#define S16_ISPOW2_NEG(denom) \
+ ((denom)==-2 || \
+  (denom)==-4 || \
+  (denom)==-8 || \
+  (denom)==-16 || \
+  (denom)==-32 || \
+  (denom)==-64 || \
+  (denom)==-128 || \
+  (denom)==-256 || \
+  (denom)==-512 || \
+  (denom)==-1024 || \
+  (denom)==-2048 || \
+  (denom)==-4096 || \
+  (denom)==-8192 || \
+  (denom)==-16384)
+#define S16_ISPOW2_POS(denom) \
+ ((denom)==2 || \
+  (denom)==4 || \
+  (denom)==8 || \
+  (denom)==16 || \
+  (denom)==32 || \
+  (denom)==64 || \
+  (denom)==128 || \
+  (denom)==256 || \
+  (denom)==512 || \
+  (denom)==1024 || \
+  (denom)==2048 || \
+  (denom)==4096 || \
+  (denom)==8192 || \
+  (denom)==16384)
+#define U16_ISPOW2(denom) (S16_ISPOW2_POS(denom) || (denom)==32768)
+#define S16_ISPOW2(denom) (S16_ISPOW2_POS(denom) || S16_ISPOW2_NEG(denom))
+
+// Prefix for libdivide calls: libdivide:: when compiled as C++, empty in C
+#ifndef __cplusplus
+#define LIB_DIV_NAMESPACE
+#else
+#define LIB_DIV_NAMESPACE libdivide::
+#endif
+
+/*
+* Wrapper for *unsigned* 16-bit DIVISION. The divisor must be a compile time
+* constant written as a bare literal: it is pasted into the U16LD_DENOM_ names.
+* E.g. FAST_DIV16U(value, 100)
+*/
+#define U16_MAGIC(d) CONCAT(CONCAT(U16LD_DENOM_, d), _MAGIC)
+#define U16_MORE(d) CONCAT(CONCAT(U16LD_DENOM_, d), _MORE)
+#define FAST_DIV16U(a, d) (U16_ISPOW2(d) ? (a)/(d) : LIB_DIV_NAMESPACE libdivide_u16_do_raw(a, U16_MAGIC(d), U16_MORE(d)))
+
+/*
+* Wrapper for *signed* 16-bit DIVISION by a *POSITIVE* compile time constant.
+* E.g. FAST_DIV16(-value, 777)
+*
+* This only works for positive divisors :-(
+* A negative number would put a hyphen in the macro name, which is not allowed.
+*/
+#define S16_MAGIC(d) CONCAT(CONCAT(S16LD_DENOM_, d), _MAGIC)
+#define S16_MORE(d) CONCAT(CONCAT(S16LD_DENOM_, d), _MORE)
+#define FAST_DIV16(a, d) (S16_ISPOW2(d) ? (a)/(d) : LIB_DIV_NAMESPACE libdivide_s16_do_raw(a, S16_MAGIC(d), S16_MORE(d)))
+
+/*
+* Wrapper for *signed* 16-bit DIVISION by a *NEGATIVE* compile time constant.
+* E.g. FAST_DIV16_NEG(-value, 777) // <-- Divides by -777. Really.
+*
+* Pass the divisor's magnitude as a positive literal: the macro negates it.
+* A negative number would put a hyphen in the macro name, which is not allowed.
+*/
+#define S16_MAGIC_NEG(d) CONCAT(CONCAT(S16LD_DENOM_MINUS_, d), _MAGIC)
+#define S16_MORE_NEG(d) CONCAT(CONCAT(S16LD_DENOM_MINUS_, d), _MORE)
+#define FAST_DIV16_NEG(a, d) (S16_ISPOW2(d) ? (a)/-(d) : LIB_DIV_NAMESPACE libdivide_s16_do_raw(a, S16_MAGIC_NEG(d), S16_MORE_NEG(d)))
+
+/*
+* Wrapper for *unsigned* 16-bit MODULUS. The divisor must be a compile time
+* constant. Note: the numerator expression is evaluated twice.
+* E.g. FAST_MOD16U(value, 6)
+*/
+#define FAST_MOD16U(a, d) ((a) - (FAST_DIV16U((a), d) * (d)))
@@ -229,8 +229,12 @@ static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfr
 static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d);
 static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);
 
+static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
+    int16_t numer, int16_t magic, uint8_t more);
 static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
     int16_t numer, const struct libdivide_s16_t* denom);
+static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
+    uint16_t numer, uint16_t magic, uint8_t more);    
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
     uint16_t numer, const struct libdivide_u16_t* denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
@@ -736,13 +740,15 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
     return ret;
 }
 
-uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) {
-    uint8_t more = denom->more;
-    if (!denom->magic) {
+// libdivide_u16_do takes a const pointer, which cannot be used with a compile
+// time constant libdivide_u16_t: it generates a warning about taking the address
+// of a temporary. Hence this variant, which takes magic & more by value.
+uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
+    if (!magic) {
         return numer >> more;
     }
     else {
-        uint16_t q = libdivide_mullhi_u16(denom->magic, numer);
+        uint16_t q = libdivide_mullhi_u16(magic, numer);
         if (more & LIBDIVIDE_ADD_MARKER) {
             uint16_t t = ((numer - q) >> 1) + q;
             return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
@@ -752,7 +758,11 @@ uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) {
             // don't need to mask them off.
             return q >> more;
         }
-    }
+    }    
+}
+
+uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) {
+    return libdivide_u16_do_raw(numer, denom->magic, denom->more);  // forward the struct's fields to the by-value variant
 }
 
 uint16_t libdivide_u16_branchfree_do(
@@ -1237,11 +1247,13 @@ struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) {
     return result;
 }
 
-int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
-    uint8_t more = denom->more;
+// libdivide_s16_do takes a const pointer, which cannot be used with a compile
+// time constant libdivide_s16_t: it generates a warning about taking the address
+// of a temporary. Hence this variant, which takes magic & more by value.
+int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) {
     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
 
-    if (!denom->magic) {
+    if (!magic) {
         uint16_t sign = (int8_t)more >> 7;
         uint16_t mask = ((uint16_t)1 << shift) - 1;
         uint16_t uq = numer + ((numer >> 15) & mask);
@@ -1250,7 +1262,7 @@ int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
         q = (q ^ sign) - sign;
         return q;
     } else {
-        uint16_t uq = (uint16_t)libdivide_mullhi_s16(denom->magic, numer);
+        uint16_t uq = (uint16_t)libdivide_mullhi_s16(magic, numer);
         if (more & LIBDIVIDE_ADD_MARKER) {
             // must be arithmetic shift and then sign extend
             int16_t sign = (int8_t)more >> 7;
@@ -1265,6 +1277,10 @@ int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
     }
 }
 
+int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
+    return libdivide_s16_do_raw(numer, denom->magic, denom->more);  // forward the struct's fields to the by-value variant
+}
+
 int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) {
     uint8_t more = denom->more;
     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;