adbancroft
diff --git a/‎README.md
+3-2 b/‎README.md
+3-2
diff --git a/‎constant_fast_div.hpp
+76 b/‎constant_fast_div.hpp
+76
@@ -19,7 +19,7 @@ vector division which provides an even larger speedup. You can test how much
 speedup you can achieve on your CPU using the [benchmark](#benchmark-program)
 program.
 
-libdivide is compatible with 8-bit microcontrollers, such as the AVR series: [the CI build includes a AtMega2560 target](test/avr/readme.md). Since low end hardware such as this often do not include a hardware divider, libdivide is particulary useful. In addition to the runtime [C](https://github.com/ridiculousfish/libdivide/blob/master/doc/C-API.md) & [C++](https://github.com/ridiculousfish/libdivide/blob/master/doc/CPP-API.md) APIs, a set of [predefined macros](constant_fast_div.h) is included to speed up division by 16-bit constants: division by a 16-bit constant is [not optimized by avr-gcc on 8-bit systems](https://stackoverflow.com/questions/47994933/why-doesnt-gcc-or-clang-on-arm-use-division-by-invariant-integers-using-multip). 
+libdivide is compatible with 8-bit microcontrollers, such as the AVR series: [the CI build includes an ATmega2560 target](test/avr/readme.md). Since low-end hardware such as this often does not include a hardware divider, libdivide is particularly useful. In addition to the runtime [C](https://github.com/ridiculousfish/libdivide/blob/master/doc/C-API.md) & [C++](https://github.com/ridiculousfish/libdivide/blob/master/doc/CPP-API.md) APIs, a set of [predefined macros](constant_fast_div.h) and [templates](constant_fast_div.hpp) is included to speed up division by 16-bit constants: division by a 16-bit constant is [not optimized by avr-gcc on 8-bit systems](https://stackoverflow.com/questions/47994933/why-doesnt-gcc-or-clang-on-arm-use-division-by-invariant-integers-using-multip).
 
 See https://libdivide.com for more information on libdivide.
 
@@ -83,7 +83,8 @@ void divide(int64_t *array, size_t size, int64_t divisor)
 
 * [C API](https://github.com/ridiculousfish/libdivide/blob/master/doc/C-API.md)
 * [C++ API](https://github.com/ridiculousfish/libdivide/blob/master/doc/CPP-API.md)
-* [Invariant Division](constant_fast_div.h)
+* [Macro Invariant Division](constant_fast_div.h)
+* [Template Based Invariant Division](constant_fast_div.hpp)
 
 # Branchfull vs branchfree
 
 
@@ -0,0 +1,76 @@
+/*
+* When dividing by a known compile time constant, the division can be replaced
+* by a multiply+shift operation. GCC will do this automatically, 
+* *BUT ONLY FOR DIVISION OF REGISTER-WIDTH OR NARROWER*.
+*
+* So on an 8-bit system, 16-bit divides will *NOT* be optimised.
+*
+* The templates here manually apply the multiply+shift operation for 16-bit numbers.
+*/
+
+#pragma once
+#include "libdivide.h"
+#include "u16_ldparams.h"
+#include "s16_ldparams.h"
+
+#ifdef __cplusplus
+namespace libdivide {
+  
+  // Implementation details
+  namespace detail {
+
+    // Primary template for the pre-generated libdivide constants. It is
+    // deliberately empty: the specializations that carry the precomputed
+    // constants are expected to come from the headers included right after it.
+    template <typename IntT, IntT divisor>
+    struct libdivide_constants {
+    };
+    #include "u16_ldparams.hpp"
+    #include "s16_ldparams.hpp"
+
+    // Primary template: falls back to the built-in '/' operator.
+    // Performant for divisors that are a power of 2.
+    template <typename T, T divisor, bool is_power2>
+    struct fast_divide_t {
+        static LIBDIVIDE_INLINE T divide(T numerator) {
+            return numerator / divisor;
+        }
+    };
+
+    // Unsigned divide by 1: the quotient is the numerator itself, so it is
+    // handed straight back without doing any work.
+    template <bool is_power2>
+    struct fast_divide_t<uint16_t, 1U, is_power2> {
+        static LIBDIVIDE_INLINE uint16_t divide(uint16_t numerator) {
+            return numerator;
+        }
+    };
+    // Signed divide by 1: the quotient is the numerator itself, so it is
+    // handed straight back without doing any work.
+    template <bool is_power2>
+    struct fast_divide_t<int16_t, 1, is_power2> {
+        static LIBDIVIDE_INLINE int16_t divide(int16_t numerator) {
+            return numerator;
+        }
+    };
+
+    // Specialized template for uint16_t divisors that are not a power of 2:
+    // passes the magic/more pair stored in libdivide_constants for this
+    // divisor to libdivide_u16_do_raw.
+    template <uint16_t divisor>
+    struct fast_divide_t<uint16_t, divisor, false> {
+        static LIBDIVIDE_INLINE uint16_t divide(uint16_t numerator) {
+            using constants = libdivide_constants<uint16_t, divisor>;
+            return libdivide_u16_do_raw(numerator,
+                                        constants::libdivide.magic,
+                                        constants::libdivide.more);
+        }
+    };
+
+    // Specialized template for int16_t divisors that are not a power of 2:
+    // passes the magic/more pair stored in libdivide_constants for this
+    // divisor to libdivide_s16_do_raw.
+    template <int16_t divisor>
+    struct fast_divide_t<int16_t, divisor, false> {
+        static LIBDIVIDE_INLINE int16_t divide(int16_t numerator) {
+            using constants = libdivide_constants<int16_t, divisor>;
+            return libdivide_s16_do_raw(numerator,
+                                        constants::libdivide.magic,
+                                        constants::libdivide.more);
+        }
+    };
+
+    // Power of 2 test: val is true only when N is a positive power of two.
+    // N > 0 is checked first (rather than N != 0) so that N - 1 is never
+    // evaluated for the most negative value of a signed T; there it would be
+    // a signed overflow inside a constant expression (a compile error) on
+    // targets whose int is no wider than T.
+    template <typename T, T N>
+    struct is_power_of_two {
+        static constexpr bool val = (N > 0) && ((N & (N - 1)) == 0);
+    };
+  }
+
+  // Public API.
+  // Returns n divided by the compile-time constant `divisor`. The
+  // implementation is picked at compile time: detail::fast_divide_t is
+  // selected on T, divisor and whether divisor is a power of two (as
+  // reported by detail::is_power_of_two).
+  // A zero divisor is rejected up front with a readable diagnostic.
+  template <typename T, T divisor>
+  LIBDIVIDE_INLINE T fast_divide(T n) {
+      static_assert(divisor != 0, "fast_divide: divisor must be non-zero");
+      using impl = detail::fast_divide_t<T, divisor, detail::is_power_of_two<T, divisor>::val>;
+      return impl::divide(n);
+  }
+
+}
+#endif