
Commit ad86ffa

Using templates to force unrolling and removing branches from hot loops

1 parent c131f98

2 files changed: +145 -35 lines changed
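The change in a nutshell, as a minimal illustrative sketch (the `unrolled` and `demo` helpers below are hypothetical, not code from this diff): expanding a fold over `std::index_sequence` replaces a runtime loop with one inlined call per index, and because each index arrives typed as a `std::integral_constant`, branches on it can be decided with `if constexpr` and disappear from the emitted code.

```cpp
#include <cstdint>
#include <utility>

// The fold over the comma operator emits one inlined call per index,
// so the "loop" is fully unrolled at compile time.
template<std::int64_t... Is, class F>
constexpr void unrolled(F &&f, std::integer_sequence<std::int64_t, Is...>) {
  (f(std::integral_constant<std::int64_t, Is>{}), ...);
}

void demo(float *out) {
  unrolled([&](auto i) {
    // i's value lives in its type, so this branch costs nothing at runtime
    if constexpr (i % 2 == 0) {
      out[i] = 1.0f;
    } else {
      out[i] = -1.0f;
    }
  }, std::make_integer_sequence<std::int64_t, 8>{});
}
```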

include/common/utils.h

Lines changed: 101 additions & 1 deletion
```diff
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <array>
+#include <cstdint>
 #include <tuple>
 #include <type_traits>
 #include <utility>
```
```diff
@@ -100,7 +101,7 @@ template<typename Tuple> auto extract_vals(const Tuple &t) {
 }
 
 template<typename Tuple, std::size_t... I>
-auto extract_seqs_impl(const Tuple &t, std::index_sequence<I...>) {
+auto extract_seqs_impl(const Tuple &, std::index_sequence<I...>) {
   using T = std::remove_reference_t<Tuple>;
   return std::make_tuple(typename std::tuple_element_t<I, T>::seq_type{}...);
 }
```
```diff
@@ -145,5 +146,104 @@ decltype(auto) dispatch(Func &&func, ParamTuple &&params, Args &&...args) {
   if constexpr (!std::is_void_v<result_t>) return caller.result;
 }
 
+// ----------------------------------------------
+// compile-time range length for [Start, Stop) with step Inc
+// ----------------------------------------------
+template<std::int64_t Start, std::int64_t Stop, std::int64_t Inc>
+inline constexpr std::int64_t compute_range_count = [] {
+  static_assert(Inc != 0, "Inc must not be zero");
+  if constexpr (Inc > 0) {
+    const std::int64_t d = Stop - Start;
+    return d > 0 ? ((d + Inc - 1) / Inc) : 0;
+  } else {
+    const std::int64_t d = Start - Stop;
+    const std::int64_t s = -Inc;
+    return d > 0 ? ((d + s - 1) / s) : 0;
+  }
+}();
+
+// ----------------------------------------------
+// low-level emitters (C++17, no templated lambdas)
+// ----------------------------------------------
+template<std::int64_t Start, std::int64_t Inc, class F, std::size_t... Is>
+constexpr void static_loop_impl_block(F &&f, std::index_sequence<Is...>) {
+  (f(std::integral_constant<std::int64_t, Start + static_cast<std::int64_t>(Is) * Inc>{}),
+   ...);
+}
+
+template<std::int64_t Start, std::int64_t Inc, std::int64_t K, class F, std::size_t... Bs>
+constexpr void static_loop_emit_all_blocks(F &&f, std::index_sequence<Bs...>) {
+  static_assert(K > 0, "UNROLL K must be positive");
+  (static_loop_impl_block<Start + static_cast<std::int64_t>(Bs) * K * Inc, Inc>(
+       f, std::make_index_sequence<static_cast<std::size_t>(K)>{}),
+   ...);
+}
+
+// ----------------------------------------------
+// static_loop
+// ----------------------------------------------
+template<std::int64_t Start, std::int64_t Stop, std::int64_t Inc = 1,
+         std::int64_t UNROLL = compute_range_count<Start, Stop, Inc>, class F>
+constexpr void static_loop(F &&f) {
+  static_assert(Inc != 0, "Inc must not be zero");
+
+  constexpr std::int64_t Count = compute_range_count<Start, Stop, Inc>;
+  if constexpr (Count == 0) {
+    // do nothing
+  } else {
+    constexpr std::int64_t k = (UNROLL > 0 ? UNROLL : Count);
+    static_assert(k > 0, "internal: k must be positive");
+    constexpr std::int64_t Blocks = Count / k;
+    constexpr std::int64_t Tail = Count % k;
+
+    if constexpr (Blocks > 0) {
+      static_loop_emit_all_blocks<Start, Inc, k>(
+          std::forward<F>(f),
+          std::make_index_sequence<static_cast<std::size_t>(Blocks)>{});
+    }
+    if constexpr (Tail > 0) {
+      constexpr std::int64_t TailStart = Start + Blocks * k * Inc;
+      static_loop_impl_block<TailStart, Inc>(
+          std::forward<F>(f), std::make_index_sequence<static_cast<std::size_t>(Tail)>{});
+    }
+  }
+}
+
+// convenience: Stop only => Start=0, Inc=1
+template<std::int64_t Stop, class F> constexpr void static_loop(F &&f) {
+  static_loop<0, Stop, 1, compute_range_count<0, Stop, 1>>(std::forward<F>(f));
+}
+
+// ----------------------------------------------
+// static_for wrappers expecting f.template operator()<I>()
+// keeps C++17 by adapting to integral_constant form above
+// ----------------------------------------------
+namespace detail {
+template<class F> struct as_template_index {
+  F *pf;
+  constexpr explicit as_template_index(F &f) : pf(&f) {}
+  template<std::int64_t I>
+  constexpr void operator()(std::integral_constant<std::int64_t, I>) const {
+    pf->template operator()<I>();
+  }
+};
+template<class T>
+using is_integral_cx = std::integral_constant<bool, std::is_integral_v<T>>;
+} // namespace detail
+
+template<auto Start, auto End, class F> constexpr void static_for(F &&f) {
+  static_assert(detail::is_integral_cx<decltype(Start)>::value &&
+                    detail::is_integral_cx<decltype(End)>::value,
+                "Start/End must be integral constant expressions");
+  static_assert(End >= Start, "End must be >= Start");
+
+  constexpr auto S = static_cast<std::int64_t>(Start);
+  constexpr auto E = static_cast<std::int64_t>(End);
+  static_loop<S, E, 1>(detail::as_template_index<F>{f});
+}
+
+template<auto End, class F> constexpr void static_for(F &&f) {
+  static_for<0, End>(std::forward<F>(f));
+}
 } // namespace common
 } // namespace finufft
```
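A usage sketch for the new helpers (`scale`, `Body`, and `bump` are hypothetical callers, and the include path assumes the repo's `include/` directory is on the include path). `compute_range_count` mirrors the trip count of `for (i = Start; i < Stop; i += Inc)`, negative steps included, and `static_loop` hands the body each index as an `integral_constant`:

```cpp
#include <cstdint>
#include "common/utils.h" // the header patched above

using finufft::common::compute_range_count;
using finufft::common::static_for;
using finufft::common::static_loop;

// trip counts match the equivalent runtime for-loops
static_assert(compute_range_count<0, 10, 3> == 4);  // 0, 3, 6, 9
static_assert(compute_range_count<10, 0, -4> == 3); // 10, 6, 2
static_assert(compute_range_count<5, 5, 1> == 0);   // empty range

void scale(float *v) {
  // unrolls to i = 0, 2, 4, 6; i is an integral_constant, so the body
  // can also use it in constexpr contexts
  static_loop<0, 8, 2>([&](auto i) { v[i] *= 2.0f; });
}

// static_for expects f.template operator()<I>(); in C++17 that means a
// struct with a templated call operator (templated lambdas are C++20)
struct Body {
  float *v;
  template<std::int64_t I> void operator()() const { v[I] += 1.0f; }
};

void bump(float *v) { static_for<0, 4>(Body{v}); }
```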

src/spreadinterp.cpp

Lines changed: 44 additions & 34 deletions
```diff
@@ -95,8 +95,10 @@ template<class T, uint8_t N> constexpr auto GetPaddedSIMDWidth() {
   // that minimizes the number of iterations
   return xsimd::make_sized_batch<T, find_optimal_simd_width<T, N>()>::type::size;
 }
+
 template<class T, uint8_t N>
 using PaddedSIMD = typename xsimd::make_sized_batch<T, GetPaddedSIMDWidth<T, N>()>::type;
+
 template<class T, uint8_t ns> constexpr auto get_padding() {
   // helper function to get the padding for the given number of elements
   // ns is known at compile time, rounds ns to the next multiple of the SIMD width
```
```diff
@@ -130,8 +132,10 @@ template<class T> uint8_t get_padding(uint8_t ns) {
   // that's why is hardcoded here
   return get_padding_helper<T, 2 * MAX_NSPREAD>(ns);
 }
+
 template<class T, uint8_t N>
 using BestSIMD = typename decltype(BestSIMDHelper<T, N, xsimd::batch<T>::size>())::type;
+
 template<class T, class V, size_t... Is>
 constexpr T generate_sequence_impl(V a, V b, index_sequence<Is...>) noexcept {
   // utility function to generate a sequence of a, b interleaved as function arguments
```
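Both padding helpers round `ns` up to the next multiple of the SIMD width; the same power-of-two mask trick reappears as `padded_ns` in the hunk below. A standalone sketch (`round_up` is a hypothetical name, and the mask form assumes the width is a power of two):

```cpp
#include <cstdint>

// round n up to the next multiple of width; width must be a power of two
// for the mask form to be exact
constexpr std::uint32_t round_up(std::uint32_t n, std::uint32_t width) {
  return (n + width - 1) & ~(width - 1);
}

static_assert(round_up(7, 4) == 8);
static_assert(round_up(8, 4) == 8); // already aligned: unchanged
static_assert(round_up(9, 8) == 16);
```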
```diff
@@ -278,43 +282,46 @@ static void evaluate_kernel_vector(T *ker, T *args,
 }
 
 template<typename T, uint8_t w, uint8_t upsampfact,
-         class simd_type =
-             xsimd::make_sized_batch_t<T, find_optimal_simd_width<T, w>()>> // aka ns
+         class simd_type = xsimd::make_sized_batch_t<T, find_optimal_simd_width<T, w>()>>
 static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner(T *FINUFFT_RESTRICT ker, T x,
                                                          const finufft_spread_opts &opts
-                                                         [[maybe_unused]]) noexcept
-/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
-x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns.
-This is the current evaluation method, since it's faster (except i7 w=16).
-Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
+                                                         [[maybe_unused]]) noexcept {
+  /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
+     x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns.
+     This is the current evaluation method, since it's faster (except i7 w=16).
+     Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
+  // scale so local grid offset z in [-1,1]
+  const T z = std::fma(T(2.0), x, T(w - 1));
+  using arch_t = typename simd_type::arch_type;
+  static constexpr auto alignment = arch_t::alignment();
+  static constexpr auto simd_size = simd_type::size;
+  static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1);
 
-{
-  // scale so local grid offset z in[-1,1]
-  const T z = std::fma(T(2.0), x, T(w - 1));
-  using arch_t = typename simd_type::arch_type;
-  static constexpr auto alignment = arch_t::alignment();
-  static constexpr auto simd_size = simd_type::size;
-  static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1);
   static constexpr auto horner_coeffs = []() constexpr noexcept {
     if constexpr (upsampfact == 200) {
       return get_horner_coeffs_200<T, w>();
-    } else if constexpr (upsampfact == 125) {
+    } else { // upsampfact == 125
+      // this way we can have a static assert that things went wrong
+      static_assert(upsampfact == 125, "Unsupported upsampfact");
       return get_horner_coeffs_125<T, w>();
     }
   }();
-  static constexpr auto nc = horner_coeffs.size();
-  static constexpr auto use_ker_sym = (simd_size < w);
+  static constexpr auto nc = horner_coeffs.size();
 
   alignas(alignment) static constexpr auto padded_coeffs =
       pad_2D_array_with_zeros<T, nc, w, padded_ns>(horner_coeffs);
 
   // use kernel symmetry trick if w > simd_size
+  static constexpr bool use_ker_sym = (simd_size < w);
+
+  const simd_type zv{z};
+
   if constexpr (use_ker_sym) {
     static constexpr uint8_t tail = w % simd_size;
     static constexpr uint8_t if_odd_degree = ((nc + 1) % 2);
-    static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size;
     static constexpr uint8_t end_idx = (w + (tail > 0)) / 2;
-    const simd_type zv{z};
+    static constexpr uint8_t offset_start = tail ? (w - tail) : (w - simd_size);
+
     const auto z2v = zv * zv;
 
     // some xsimd constant for shuffle or inverse
```
```diff
@@ -328,30 +335,32 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
       }
     }();
 
-    // process simd vecs
     simd_type k_prev, k_sym{0};
-    for (uint8_t i{0}, offset = offset_start; i < end_idx;
-         i += simd_size, offset -= simd_size) {
-      auto k_odd = [i]() constexpr noexcept {
+
+    static_loop<0, end_idx, simd_size>([&]([[maybe_unused]] const auto i) {
+      constexpr auto offset = static_cast<uint8_t>(offset_start - i);
+      auto k_odd = [i]() constexpr noexcept {
         if constexpr (if_odd_degree) {
           return simd_type::load_aligned(padded_coeffs[0].data() + i);
         } else {
           return simd_type{0};
         }
       }();
       auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
-      for (uint8_t j{1 + if_odd_degree}; j < nc; j += 2) {
+
+      static_loop<1 + if_odd_degree, static_cast<std::int64_t>(nc), 2>([&](const auto j) {
         const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
         const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
         k_odd = xsimd::fma(k_odd, z2v, cji_odd);
         k_even = xsimd::fma(k_even, z2v, cji_even);
-      }
+      });
+
       // left part
       xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
+
       // right part symmetric to the left part
-      if (offset >= end_idx) {
+      if constexpr (offset >= end_idx) {
         if constexpr (tail) {
-          // to use aligned store, we need shuffle the previous k_sym and current k_sym
           k_prev = k_sym;
           k_sym = xsimd::fnma(k_odd, zv, k_even);
           xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset);
```
```diff
@@ -360,20 +369,21 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
               .store_aligned(ker + offset);
         }
       }
-    }
+    });
+
   } else {
-    const simd_type zv(z);
-    for (uint8_t i = 0; i < w; i += simd_size) {
+    static_loop<0, w, simd_size>([&](const auto i) {
       auto k = simd_type::load_aligned(padded_coeffs[0].data() + i);
-      for (uint8_t j = 1; j < nc; ++j) {
+
+      static_loop<1, static_cast<std::int64_t>(nc), 1>([&](const auto j) {
         const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i);
         k = xsimd::fma(k, zv, cji);
-      }
+      });
+
       k.store_aligned(ker + i);
-    }
+    });
   }
 }
-
 template<typename T, uint8_t ns>
 static void interp_line_wrap(T *FINUFFT_RESTRICT target, const T *du, const T *ker,
                              const BIGINT i1, const UBIGINT N1) {
```

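The payoff of `static_loop` in the hunks above: the body receives `i` as a `std::integral_constant`, so `offset` can be declared `constexpr` even though `i` is a function parameter, and the former runtime test `if (offset >= end_idx)` becomes `if constexpr`, removing the branch from every unrolled iteration. A reduced sketch of why that compiles (`for_each_index` and `fill` are hypothetical stand-ins for `static_loop` and the kernel evaluator):

```cpp
#include <cstdint>
#include <utility>

// stand-in for static_loop: one inlined call per index
template<class F, std::size_t... Is>
constexpr void for_each_index(F &&f, std::index_sequence<Is...>) {
  (f(std::integral_constant<std::int64_t, Is>{}), ...);
}

void fill(float *ker) { // ker must hold at least 7 floats
  constexpr std::int64_t end_idx = 4;
  for_each_index(
      [&](auto i) {
        // i carries its value in its type, so offset is a constant
        // expression even though i is a runtime parameter
        constexpr auto offset = 6 - i; // cf. offset_start - i in the diff
        ker[i] = 1.0f;
        if constexpr (offset >= end_idx) { // resolved at compile time
          ker[offset] = 1.0f;
        }
      },
      std::make_index_sequence<4>{});
}
```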