Merged

53 commits
d163891
initial clean up
iakovenkos Dec 23, 2025
75ef963
recursive -> iterative
iakovenkos Dec 23, 2025
2f9c59d
add first approximation of docs + rm redundant alias
iakovenkos Dec 24, 2025
45e1a64
tackle issue 1449
iakovenkos Dec 24, 2025
79e17e2
tackle issue 1449
iakovenkos Dec 24, 2025
6e24ce6
Merge remote-tracking branch 'origin/merge-train/barretenberg' into s…
iakovenkos Jan 13, 2026
3bd30ca
small refactor
iakovenkos Jan 13, 2026
7ef589a
reapply centralized montgomery conversion
iakovenkos Jan 14, 2026
9a771a6
clean up
iakovenkos Jan 14, 2026
6072169
get_offset_generator out of the loop
iakovenkos Jan 14, 2026
47cae52
revert some branching
iakovenkos Jan 14, 2026
e6f9c8f
fixing magic constants + reusing existing stuff
iakovenkos Jan 14, 2026
794a038
more const updates
iakovenkos Jan 14, 2026
37c3d8b
introduce point schedule entry
iakovenkos Jan 15, 2026
a43c966
consolidated --> nonzero_scalar_indices
iakovenkos Jan 16, 2026
cc17b30
clean up get_work_units
iakovenkos Jan 16, 2026
22f583b
batch msm clean up
iakovenkos Jan 16, 2026
8916b60
evaluate_pippenger_round mutates in-place instead of returning confus…
iakovenkos Jan 16, 2026
ac4f0ab
use uint32_t where possible
iakovenkos Jan 16, 2026
5e8cae1
unfold recursion
iakovenkos Jan 16, 2026
c487e40
use common helper to process buckets
iakovenkos Jan 16, 2026
c8142f0
share logic to produce single point edge case
iakovenkos Jan 16, 2026
8f0dbfc
rm redundant args
iakovenkos Jan 16, 2026
f3d3a28
stray comment
iakovenkos Jan 16, 2026
724ca97
check regression
iakovenkos Jan 16, 2026
a2c4a5a
centralize Montgomery conversion in filtering function
iakovenkos Jan 16, 2026
4a59df3
restore iterative consume_point_schedule (cleaner than recursive)
iakovenkos Jan 16, 2026
129eb22
iterative
iakovenkos Jan 17, 2026
1200dab
more docs and renaming
iakovenkos Jan 19, 2026
b074916
brush up tests
iakovenkos Jan 19, 2026
f9e088b
another docs iteration
iakovenkos Jan 19, 2026
7fe4f71
docs+naming
iakovenkos Jan 19, 2026
6ac8e94
clean up processing functions
iakovenkos Jan 19, 2026
9ba1080
better org
iakovenkos Jan 19, 2026
50c6f88
fix docs discrepancies
iakovenkos Jan 19, 2026
3e33312
make docs concise
iakovenkos Jan 19, 2026
de82341
upd hpp
iakovenkos Jan 19, 2026
8dc83f7
fix build, fix montgomery conversion regression
iakovenkos Jan 19, 2026
806e2de
rm funny inclusion
iakovenkos Jan 19, 2026
53b6501
Merge branch 'merge-train/barretenberg' into si/pippenger-audit-0
iakovenkos Jan 20, 2026
ff7f410
fix ivc integration test?
iakovenkos Jan 20, 2026
256770d
change bench script
iakovenkos Jan 20, 2026
108da69
fix multithreading
iakovenkos Jan 20, 2026
0aaa930
rm benches
iakovenkos Jan 20, 2026
40de9d5
fix perf regression
iakovenkos Jan 20, 2026
f1eff36
md fix
iakovenkos Jan 20, 2026
15b9521
fix build
iakovenkos Jan 20, 2026
113a58a
Merge remote-tracking branch 'origin/merge-train/barretenberg' into s…
iakovenkos Jan 21, 2026
0b3e801
share montgomery trick
iakovenkos Jan 21, 2026
88f5fed
simplify policy
iakovenkos Jan 21, 2026
8b8e8d7
clear docs
iakovenkos Jan 21, 2026
9755bb2
Merge remote-tracking branch 'origin/merge-train/barretenberg' into s…
iakovenkos Jan 23, 2026
c2a927b
bad merge + address comment
iakovenkos Jan 23, 2026
21 changes: 21 additions & 0 deletions barretenberg/cpp/CLAUDE.md
@@ -103,3 +103,24 @@ BB_VERBOSE=1 yarn-project/scripts/run_test.sh ivc-integration/src/rollup_ivc_int
````

When making barretenberg changes, ensure these tests still pass.

Contributor (author) commented:
Claude would often try to create a custom benching script, or bench perf-critical primitives locally.

## Benchmarking:

**IMPORTANT**: In the barretenberg context, "bench" or "benchmark" almost always means running `benchmark_remote.sh` for the given target on a remote benchmarking machine.

To run benchmarks for a specific target:
```bash
cd barretenberg/cpp
./scripts/benchmark_remote.sh <target_name>
```

Common benchmark targets:
- `pippenger_bench` - MSM/Pippenger benchmarks
- `ultra_honk_bench` - Ultra Honk prover benchmarks
- `commitment_schemes_bench` - Commitment scheme benchmarks

The remote benchmark script:
- Runs on a dedicated benchmarking machine for consistent results
- Automatically builds the target if needed
- Returns performance metrics and timing data
- Should be used instead of local benchmarks for performance validation
51 changes: 51 additions & 0 deletions barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
@@ -169,6 +169,57 @@ template <class Fq, class Fr, class Params> std::ostream& operator<<(std::ostrea
// constexpr element<Fq, Fr, Params>::one = element<Fq, Fr, Params>{ Params::one_x, Params::one_y, Fq::one() };
// constexpr element<Fq, Fr, Params>::point_at_infinity = one.set_infinity();
// constexpr element<Fq, Fr, Params>::curve_b = Params::b;

/**
* @brief Memory layout policy for batch affine operations with parallel arrays
* @details Layout: (lhs[i], rhs[i]) -> rhs[i] with sequential output (no prefetch needed)
*/
struct ParallelArrayPolicy {
static constexpr bool ENABLE_PREFETCH = false;

template <typename AffineElement> static constexpr size_t lhs_index(size_t i) noexcept { return i; }

template <typename AffineElement> static constexpr size_t rhs_index(size_t i) noexcept { return i; }

template <typename AffineElement>
static constexpr size_t output_index(size_t i, [[maybe_unused]] size_t num_pairs) noexcept
{
return i;
}
};

/**
* @brief Memory layout policy for batch affine operations with interleaved arrays
* @details Layout: (points[2i], points[2i+1]) -> points[num_pairs+i] (non-sequential, needs prefetch)
*/
struct InterleavedArrayPolicy {
static constexpr bool ENABLE_PREFETCH = true;

template <typename AffineElement> static constexpr size_t lhs_index(size_t i) noexcept { return i * 2; }

template <typename AffineElement> static constexpr size_t rhs_index(size_t i) noexcept { return (i * 2) + 1; }

template <typename AffineElement> static constexpr size_t output_index(size_t i, size_t num_pairs) noexcept
{
return num_pairs + i;
}

template <typename AffineElement, typename Fq>
static void prefetch_iteration(const AffineElement* base_points,
const Fq* scratch,
size_t i,
size_t num_pairs) noexcept
{
if (i >= 1) {
size_t prev = i - 1;
__builtin_prefetch(&base_points[prev * 2]);
__builtin_prefetch(&base_points[(prev * 2) + 1]);
__builtin_prefetch(&base_points[num_pairs + prev]);
__builtin_prefetch(&scratch[prev]);
}
}
};

} // namespace bb::group_elements

#include "./element_impl.hpp"
246 changes: 131 additions & 115 deletions barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
@@ -711,19 +711,134 @@ element<Fq, Fr, T> element<Fq, Fr, T>::mul_with_endomorphism(const Fr& scalar) c
return accumulator;
}

/**
* @brief Core batch affine addition using Montgomery's batch inversion trick
* @tparam Policy Memory layout policy (ParallelArrayPolicy or InterleavedArrayPolicy)
* @tparam AffineElement Affine point type
* @tparam Fq Base field type
*
* @warning ASSUMES NO EDGE CASES:
* - All points must be valid (not point at infinity)
* - lhs[i] != rhs[i] for all i (no point doubling cases)
* - lhs[i] != -rhs[i] for all i (no point at infinity results)
* - Points are linearly independent (generic position)
*
* @note This is the "unsafe" fast path. For general point addition with edge case handling,
* use Jacobian arithmetic or handle edge cases separately before calling this function.
*/
template <typename Policy, typename AffineElement, typename Fq>
__attribute__((always_inline)) inline void batch_affine_add_impl(const AffineElement* lhs_base,
AffineElement* rhs_base,
const size_t num_pairs,
Fq* scratch_space) noexcept
{
Fq batch_inversion_accumulator = Fq::one();

// Forward pass: prepare batch inversion
for (size_t i = 0; i < num_pairs; ++i) {
const AffineElement& lhs = lhs_base[Policy::template lhs_index<AffineElement>(i)];
AffineElement& rhs = rhs_base[Policy::template rhs_index<AffineElement>(i)];
Fq& scratch = scratch_space[i];

scratch = lhs.x + rhs.x;
rhs.x -= lhs.x;
rhs.y -= lhs.y;
rhs.y *= batch_inversion_accumulator;
batch_inversion_accumulator *= (rhs.x);
}

if (batch_inversion_accumulator == Fq::zero()) {
throw_or_abort("attempted to invert zero in batch_affine_add_impl");
}
batch_inversion_accumulator = batch_inversion_accumulator.invert();

// Backward pass: compute additions
for (size_t i_plus_1 = num_pairs; i_plus_1 > 0; --i_plus_1) {
size_t i = i_plus_1 - 1;

const AffineElement& lhs = lhs_base[Policy::template lhs_index<AffineElement>(i)];
AffineElement& rhs = rhs_base[Policy::template rhs_index<AffineElement>(i)];
AffineElement& output = rhs_base[Policy::template output_index<AffineElement>(i, num_pairs)];
Fq& scratch = scratch_space[i];

if constexpr (Policy::ENABLE_PREFETCH) {
Policy::prefetch_iteration(rhs_base, scratch_space, i, num_pairs);
}

rhs.y *= batch_inversion_accumulator;
batch_inversion_accumulator *= rhs.x;
rhs.x = rhs.y.sqr();
output.x = rhs.x - scratch;
scratch = lhs.x - output.x;
scratch *= rhs.y;
output.y = scratch - lhs.y;
}
}

/**
* @brief Batch affine point doubling using Montgomery's trick
* @tparam AffineElement Affine point type
* @tparam Fq Base field type
*
* @warning ASSUMES NO EDGE CASES:
* - All points must be valid (not point at infinity)
* - points[i].y != 0 for all i (no vertical tangents)
* - No points with order 2 (where 2P = point at infinity)
*
* @note This is the "unsafe" fast path. For general point doubling with edge case handling,
* use Jacobian arithmetic or check for edge cases before calling this function.
*/
template <typename AffineElement, typename Fq>
__attribute__((always_inline)) inline void batch_affine_double_impl(AffineElement* points,
const size_t num_points,
Fq* scratch_space) noexcept
{
Fq batch_inversion_accumulator = Fq::one();

// Forward pass: prepare batch inversion
for (size_t i = 0; i < num_points; ++i) {
scratch_space[i] = points[i].x.sqr();
scratch_space[i] = scratch_space[i] + scratch_space[i] + scratch_space[i];
scratch_space[i] *= batch_inversion_accumulator;
batch_inversion_accumulator *= (points[i].y + points[i].y);
}

if (batch_inversion_accumulator == Fq::zero()) {
throw_or_abort("attempted to invert zero in batch_affine_double_impl");
}
batch_inversion_accumulator = batch_inversion_accumulator.invert();

// Backward pass: compute doublings
Fq temp_x;
for (size_t i_plus_1 = num_points; i_plus_1 > 0; --i_plus_1) {
size_t i = i_plus_1 - 1;

scratch_space[i] *= batch_inversion_accumulator;
batch_inversion_accumulator *= (points[i].y + points[i].y);

temp_x = points[i].x;
points[i].x = scratch_space[i].sqr() - (points[i].x + points[i].x);
points[i].y = scratch_space[i] * (temp_x - points[i].x) - points[i].y;
}
}
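For reference, the backward pass implements the standard affine doubling formulas for a short Weierstrass curve with a = 0 (the case used here), with the slope held in `scratch_space[i]`:

```latex
\lambda = \frac{3x_1^2}{2y_1}, \qquad
x_3 = \lambda^2 - 2x_1, \qquad
y_3 = \lambda\,(x_1 - x_3) - y_1
```

The forward pass accumulates the numerators `3x²` and denominators `2y`, and the single batch inversion supplies every `λ` at once.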

/**
* @brief Pairwise affine add points in first and second group
*
* @param first_group
* @param second_group
* @param results
* @param first_group Left-hand points
* @param second_group Right-hand points
* @param results Output array for results[i] = first_group[i] + second_group[i]
*
* @warning This function does NOT handle edge cases (point at infinity, point doubling, etc.).
* For generic point addition with edge case handling, use Jacobian coordinates instead.
* Only use this when you know points are in generic position (e.g., in Pippenger/MSM).
*/
template <class Fq, class Fr, class T>
void element<Fq, Fr, T>::batch_affine_add(const std::span<affine_element<Fq, Fr, T>>& first_group,
const std::span<affine_element<Fq, Fr, T>>& second_group,
const std::span<affine_element<Fq, Fr, T>>& results) noexcept
{
typedef affine_element<Fq, Fr, T> affine_element;
using affine_element = affine_element<Fq, Fr, T>;
const size_t num_points = first_group.size();
BB_ASSERT_EQ(second_group.size(), first_group.size());

@@ -733,51 +848,15 @@ void element<Fq, Fr, T>::batch_affine_add(const std::span<affine_element<Fq, Fr,
parallel_for_heuristic(
num_points, [&](size_t i) { results[i] = first_group[i]; }, thread_heuristics::FF_COPY_COST * 2);

// TODO(#826): Same code as in batch mul
// we can mutate rhs but NOT lhs!
// output is stored in rhs
/**
* @brief Perform point addition rhs[i]=rhs[i]+lhs[i] with batch inversion
*
*/
const auto batch_affine_add_chunked =
[](const affine_element* lhs, affine_element* rhs, const size_t point_count, Fq* personal_scratch_space) {
Fq batch_inversion_accumulator = Fq::one();

for (size_t i = 0; i < point_count; i += 1) {
personal_scratch_space[i] = lhs[i].x + rhs[i].x; // x2 + x1
rhs[i].x -= lhs[i].x; // x2 - x1
rhs[i].y -= lhs[i].y; // y2 - y1
rhs[i].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old
batch_inversion_accumulator *= (rhs[i].x);
}
batch_inversion_accumulator = batch_inversion_accumulator.invert();

for (size_t i = (point_count)-1; i < point_count; i -= 1) {
rhs[i].y *= batch_inversion_accumulator; // update accumulator
batch_inversion_accumulator *= rhs[i].x;
rhs[i].x = rhs[i].y.sqr();
rhs[i].x = rhs[i].x - (personal_scratch_space[i]); // x3 = lambda_squared - x2
// - x1
personal_scratch_space[i] = lhs[i].x - rhs[i].x;
personal_scratch_space[i] *= rhs[i].y;
rhs[i].y = personal_scratch_space[i] - lhs[i].y;
}
};

/**
* @brief Perform batch affine addition in parallel
*
*/
const auto batch_affine_add_internal = [&](const affine_element* lhs, affine_element* rhs) {
parallel_for_heuristic(
num_points,
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start);
},
thread_heuristics::FF_ADDITION_COST * 6 + thread_heuristics::FF_MULTIPLICATION_COST * 6);
};
batch_affine_add_internal(&second_group[0], &results[0]);
// Perform batch affine addition. Uses ParallelArrayPolicy: (lhs[i], rhs[i]) -> rhs[i] with sequential output (no
// prefetch)
parallel_for_heuristic(
num_points,
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
batch_affine_add_impl<ParallelArrayPolicy, affine_element, Fq>(
&second_group[start], &results[start], end - start, &scratch_space[start]);
},
thread_heuristics::FF_ADDITION_COST * 6 + thread_heuristics::FF_MULTIPLICATION_COST * 6);
}

/**
Expand All @@ -801,70 +880,6 @@ std::vector<affine_element<Fq, Fr, T>> element<Fq, Fr, T>::batch_mul_with_endomo
// Space for temporary values
std::vector<Fq> scratch_space(num_points);

// TODO(#826): Same code as in batch add
// we can mutate rhs but NOT lhs!
// output is stored in rhs
/**
* @brief Perform point addition rhs[i]=rhs[i]+lhs[i] with batch inversion
*
*/
const auto batch_affine_add_chunked =
[](const affine_element* lhs, affine_element* rhs, const size_t point_count, Fq* personal_scratch_space) {
Fq batch_inversion_accumulator = Fq::one();

for (size_t i = 0; i < point_count; i += 1) {
personal_scratch_space[i] = lhs[i].x + rhs[i].x; // x2 + x1
rhs[i].x -= lhs[i].x; // x2 - x1
rhs[i].y -= lhs[i].y; // y2 - y1
rhs[i].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old
batch_inversion_accumulator *= (rhs[i].x);
}
batch_inversion_accumulator = batch_inversion_accumulator.invert();

for (size_t i = (point_count)-1; i < point_count; i -= 1) {
rhs[i].y *= batch_inversion_accumulator; // update accumulator
batch_inversion_accumulator *= rhs[i].x;
rhs[i].x = rhs[i].y.sqr();
rhs[i].x = rhs[i].x - (personal_scratch_space[i]); // x3 = lambda_squared - x2
// - x1
personal_scratch_space[i] = lhs[i].x - rhs[i].x;
personal_scratch_space[i] *= rhs[i].y;
rhs[i].y = personal_scratch_space[i] - lhs[i].y;
}
};

/**
* @brief Perform point doubling lhs[i]=lhs[i]+lhs[i] with batch inversion
*
*/
const auto batch_affine_double_chunked =
[](affine_element* lhs, const size_t point_count, Fq* personal_scratch_space) {
Fq batch_inversion_accumulator = Fq::one();

for (size_t i = 0; i < point_count; i += 1) {

personal_scratch_space[i] = lhs[i].x.sqr();
personal_scratch_space[i] =
personal_scratch_space[i] + personal_scratch_space[i] + personal_scratch_space[i];

personal_scratch_space[i] *= batch_inversion_accumulator;

batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);
}
batch_inversion_accumulator = batch_inversion_accumulator.invert();

Fq temp;
for (size_t i = (point_count)-1; i < point_count; i -= 1) {

personal_scratch_space[i] *= batch_inversion_accumulator;
batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);

temp = lhs[i].x;
lhs[i].x = personal_scratch_space[i].sqr() - (lhs[i].x + lhs[i].x);
lhs[i].y = personal_scratch_space[i] * (temp - lhs[i].x) - lhs[i].y;
}
};

// We compute the resulting point through WNAF by evaluating (the (\sum_i (16ⁱ⋅
// (a_i ∈ {-15,-13,-11,-9,-7,-5,-3,-1,1,3,5,7,9,11,13,15}))) - skew), where skew is 0 or 1. The result of the sum is
// always odd and skew is used to reconstruct an even scalar. This means that to construct scalar p-1, where p is
@@ -904,12 +919,13 @@ std::vector<affine_element<Fq, Fr, T>> element<Fq, Fr, T>::batch_mul_with_endomo
auto execute_range = [&](size_t start, size_t end) {
// Perform batch affine addition in parallel
const auto add_chunked = [&](const affine_element* lhs, affine_element* rhs) {
batch_affine_add_chunked(&lhs[start], &rhs[start], end - start, &scratch_space[start]);
batch_affine_add_impl<ParallelArrayPolicy, affine_element, Fq>(
&lhs[start], &rhs[start], end - start, &scratch_space[start]);
};

// Perform point doubling in parallel
const auto double_chunked = [&](affine_element* lhs) {
batch_affine_double_chunked(&lhs[start], end - start, &scratch_space[start]);
batch_affine_double_impl<affine_element, Fq>(&lhs[start], end - start, &scratch_space[start]);
};

// Initialize first entries in lookup table