diff --git a/barretenberg/cpp/CLAUDE.md b/barretenberg/cpp/CLAUDE.md index e6d98e4776ca..5c4408d85f16 100644 --- a/barretenberg/cpp/CLAUDE.md +++ b/barretenberg/cpp/CLAUDE.md @@ -103,3 +103,24 @@ BB_VERBOSE=1 yarn-project/scripts/run_test.sh ivc-integration/src/rollup_ivc_int ```` When making barretenberg changes, ensure these tests still pass. + +## Benchmarking: + +**IMPORTANT**: In the barretenberg context, "bench" or "benchmark" almost always means running `benchmark_remote.sh` for the given target on a remote benchmarking machine. + +To run benchmarks for a specific target: +```bash +cd barretenberg/cpp +./scripts/benchmark_remote.sh <target> +``` + +Common benchmark targets: +- `pippenger_bench` - MSM/Pippenger benchmarks +- `ultra_honk_bench` - Ultra Honk prover benchmarks +- `commitment_schemes_bench` - Commitment scheme benchmarks + +The remote benchmark script: +- Runs on a dedicated benchmarking machine for consistent results +- Automatically builds the target if needed +- Returns performance metrics and timing data +- Should be used instead of local benchmarks for performance validation diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp index 5acf88b9655a..57a19d8784a1 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp @@ -169,6 +169,57 @@ template std::ostream& operator<<(std::ostrea // constexpr element::one = element{ Params::one_x, Params::one_y, Fq::one() }; // constexpr element::point_at_infinity = one.set_infinity(); // constexpr element::curve_b = Params::b; + +/** + * @brief Memory layout policy for batch affine operations with parallel arrays + * @details Layout: (lhs[i], rhs[i]) -> rhs[i] with sequential output (no prefetch needed) + */ +struct ParallelArrayPolicy { + static constexpr bool ENABLE_PREFETCH = false; + + template static constexpr size_t lhs_index(size_t i) noexcept { return i; } + 
+ template static constexpr size_t rhs_index(size_t i) noexcept { return i; } + + template + static constexpr size_t output_index(size_t i, [[maybe_unused]] size_t num_pairs) noexcept + { + return i; + } +}; + +/** + * @brief Memory layout policy for batch affine operations with interleaved arrays + * @details Layout: (points[2i], points[2i+1]) -> points[num_pairs+i] (non-sequential, needs prefetch) + */ +struct InterleavedArrayPolicy { + static constexpr bool ENABLE_PREFETCH = true; + + template static constexpr size_t lhs_index(size_t i) noexcept { return i * 2; } + + template static constexpr size_t rhs_index(size_t i) noexcept { return (i * 2) + 1; } + + template static constexpr size_t output_index(size_t i, size_t num_pairs) noexcept + { + return num_pairs + i; + } + + template + static void prefetch_iteration(const AffineElement* base_points, + const Fq* scratch, + size_t i, + size_t num_pairs) noexcept + { + if (i >= 1) { + size_t prev = i - 1; + __builtin_prefetch(&base_points[prev * 2]); + __builtin_prefetch(&base_points[(prev * 2) + 1]); + __builtin_prefetch(&base_points[num_pairs + prev]); + __builtin_prefetch(&scratch[prev]); + } + } +}; + } // namespace bb::group_elements #include "./element_impl.hpp" diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp index 4869b298c690..a6cc23e3ebba 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp @@ -711,19 +711,134 @@ element element::mul_with_endomorphism(const Fr& scalar) c return accumulator; } +/** + * @brief Core batch affine addition using Montgomery's batch inversion trick + * @tparam Policy Memory layout policy (ParallelArrayPolicy or InterleavedArrayPolicy) + * @tparam AffineElement Affine point type + * @tparam Fq Base field type + * + * @warning ASSUMES NO EDGE CASES: + * - All points must be valid (not point at infinity) + * 
- lhs[i] != rhs[i] for all i (no point doubling cases) + * - lhs[i] != -rhs[i] for all i (no point at infinity results) + * - Points are linearly independent (generic position) + * + * @note This is the "unsafe" fast path. For general point addition with edge case handling, + * use Jacobian arithmetic or handle edge cases separately before calling this function. + */ +template +__attribute__((always_inline)) inline void batch_affine_add_impl(const AffineElement* lhs_base, + AffineElement* rhs_base, + const size_t num_pairs, + Fq* scratch_space) noexcept +{ + Fq batch_inversion_accumulator = Fq::one(); + + // Forward pass: prepare batch inversion + for (size_t i = 0; i < num_pairs; ++i) { + const AffineElement& lhs = lhs_base[Policy::template lhs_index(i)]; + AffineElement& rhs = rhs_base[Policy::template rhs_index(i)]; + Fq& scratch = scratch_space[i]; + + scratch = lhs.x + rhs.x; + rhs.x -= lhs.x; + rhs.y -= lhs.y; + rhs.y *= batch_inversion_accumulator; + batch_inversion_accumulator *= (rhs.x); + } + + if (batch_inversion_accumulator == Fq::zero()) { + throw_or_abort("attempted to invert zero in batch_affine_add_impl"); + } + batch_inversion_accumulator = batch_inversion_accumulator.invert(); + + // Backward pass: compute additions + for (size_t i_plus_1 = num_pairs; i_plus_1 > 0; --i_plus_1) { + size_t i = i_plus_1 - 1; + + const AffineElement& lhs = lhs_base[Policy::template lhs_index(i)]; + AffineElement& rhs = rhs_base[Policy::template rhs_index(i)]; + AffineElement& output = rhs_base[Policy::template output_index(i, num_pairs)]; + Fq& scratch = scratch_space[i]; + + if constexpr (Policy::ENABLE_PREFETCH) { + Policy::prefetch_iteration(rhs_base, scratch_space, i, num_pairs); + } + + rhs.y *= batch_inversion_accumulator; + batch_inversion_accumulator *= rhs.x; + rhs.x = rhs.y.sqr(); + output.x = rhs.x - scratch; + scratch = lhs.x - output.x; + scratch *= rhs.y; + output.y = scratch - lhs.y; + } +} + +/** + * @brief Batch affine point doubling using 
Montgomery's trick + * @tparam AffineElement Affine point type + * @tparam Fq Base field type + * + * @warning ASSUMES NO EDGE CASES: + * - All points must be valid (not point at infinity) + * - points[i].y != 0 for all i (no vertical tangents) + * - No points with order 2 (where 2P = point at infinity) + * + * @note This is the "unsafe" fast path. For general point doubling with edge case handling, + * use Jacobian arithmetic or check for edge cases before calling this function. + */ +template +__attribute__((always_inline)) inline void batch_affine_double_impl(AffineElement* points, + const size_t num_points, + Fq* scratch_space) noexcept +{ + Fq batch_inversion_accumulator = Fq::one(); + + // Forward pass: prepare batch inversion + for (size_t i = 0; i < num_points; ++i) { + scratch_space[i] = points[i].x.sqr(); + scratch_space[i] = scratch_space[i] + scratch_space[i] + scratch_space[i]; + scratch_space[i] *= batch_inversion_accumulator; + batch_inversion_accumulator *= (points[i].y + points[i].y); + } + + if (batch_inversion_accumulator == Fq::zero()) { + throw_or_abort("attempted to invert zero in batch_affine_double_impl"); + } + batch_inversion_accumulator = batch_inversion_accumulator.invert(); + + // Backward pass: compute doublings + Fq temp_x; + for (size_t i_plus_1 = num_points; i_plus_1 > 0; --i_plus_1) { + size_t i = i_plus_1 - 1; + + scratch_space[i] *= batch_inversion_accumulator; + batch_inversion_accumulator *= (points[i].y + points[i].y); + + temp_x = points[i].x; + points[i].x = scratch_space[i].sqr() - (points[i].x + points[i].x); + points[i].y = scratch_space[i] * (temp_x - points[i].x) - points[i].y; + } +} + /** * @brief Pairwise affine add points in first and second group * - * @param first_group - * @param second_group - * @param results + * @param first_group Left-hand points + * @param second_group Right-hand points + * @param results Output array for results[i] = first_group[i] + second_group[i] + * + * @warning This function does NOT 
handle edge cases (point at infinity, point doubling, etc.). + * For generic point addition with edge case handling, use Jacobian coordinates instead. + * Only use this when you know points are in generic position (e.g., in Pippenger/MSM). */ template void element::batch_affine_add(const std::span>& first_group, const std::span>& second_group, const std::span>& results) noexcept { - typedef affine_element affine_element; + using affine_element = affine_element; const size_t num_points = first_group.size(); BB_ASSERT_EQ(second_group.size(), first_group.size()); @@ -733,51 +848,15 @@ void element::batch_affine_add(const std::span rhs[i] with sequential output (no + // prefetch) + parallel_for_heuristic( + num_points, + [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) { + batch_affine_add_impl( + &second_group[start], &results[start], end - start, &scratch_space[start]); + }, + thread_heuristics::FF_ADDITION_COST * 6 + thread_heuristics::FF_MULTIPLICATION_COST * 6); } /** @@ -801,70 +880,6 @@ std::vector> element::batch_mul_with_endomo // Space for temporary values std::vector scratch_space(num_points); - // TODO(#826): Same code as in batch add - // we can mutate rhs but NOT lhs! 
- // output is stored in rhs - /** - * @brief Perform point addition rhs[i]=rhs[i]+lhs[i] with batch inversion - * - */ - const auto batch_affine_add_chunked = - [](const affine_element* lhs, affine_element* rhs, const size_t point_count, Fq* personal_scratch_space) { - Fq batch_inversion_accumulator = Fq::one(); - - for (size_t i = 0; i < point_count; i += 1) { - personal_scratch_space[i] = lhs[i].x + rhs[i].x; // x2 + x1 - rhs[i].x -= lhs[i].x; // x2 - x1 - rhs[i].y -= lhs[i].y; // y2 - y1 - rhs[i].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (rhs[i].x); - } - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - - for (size_t i = (point_count)-1; i < point_count; i -= 1) { - rhs[i].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= rhs[i].x; - rhs[i].x = rhs[i].y.sqr(); - rhs[i].x = rhs[i].x - (personal_scratch_space[i]); // x3 = lambda_squared - x2 - // - x1 - personal_scratch_space[i] = lhs[i].x - rhs[i].x; - personal_scratch_space[i] *= rhs[i].y; - rhs[i].y = personal_scratch_space[i] - lhs[i].y; - } - }; - - /** - * @brief Perform point doubling lhs[i]=lhs[i]+lhs[i] with batch inversion - * - */ - const auto batch_affine_double_chunked = - [](affine_element* lhs, const size_t point_count, Fq* personal_scratch_space) { - Fq batch_inversion_accumulator = Fq::one(); - - for (size_t i = 0; i < point_count; i += 1) { - - personal_scratch_space[i] = lhs[i].x.sqr(); - personal_scratch_space[i] = - personal_scratch_space[i] + personal_scratch_space[i] + personal_scratch_space[i]; - - personal_scratch_space[i] *= batch_inversion_accumulator; - - batch_inversion_accumulator *= (lhs[i].y + lhs[i].y); - } - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - - Fq temp; - for (size_t i = (point_count)-1; i < point_count; i -= 1) { - - personal_scratch_space[i] *= batch_inversion_accumulator; - batch_inversion_accumulator *= (lhs[i].y + lhs[i].y); - - 
temp = lhs[i].x; - lhs[i].x = personal_scratch_space[i].sqr() - (lhs[i].x + lhs[i].x); - lhs[i].y = personal_scratch_space[i] * (temp - lhs[i].x) - lhs[i].y; - } - }; - // We compute the resulting point through WNAF by evaluating (the (\sum_i (16ⁱ⋅ // (a_i ∈ {-15,-13,-11,-9,-7,-5,-3,-1,1,3,5,7,9,11,13,15}))) - skew), where skew is 0 or 1. The result of the sum is // always odd and skew is used to reconstruct an even scalar. This means that to construct scalar p-1, where p is @@ -904,12 +919,13 @@ std::vector> element::batch_mul_with_endomo auto execute_range = [&](size_t start, size_t end) { // Perform batch affine addition in parallel const auto add_chunked = [&](const affine_element* lhs, affine_element* rhs) { - batch_affine_add_chunked(&lhs[start], &rhs[start], end - start, &scratch_space[start]); + batch_affine_add_impl( + &lhs[start], &rhs[start], end - start, &scratch_space[start]); }; // Perform point doubling in parallel const auto double_chunked = [&](affine_element* lhs) { - batch_affine_double_chunked(&lhs[start], end - start, &scratch_space[start]); + batch_affine_double_impl(&lhs[start], end - start, &scratch_space[start]); }; // Initialize first entries in lookup table diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp index 7e73e8ddbe66..18a4d6342b5b 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp @@ -240,47 +240,14 @@ void MSM::add_affine_points(typename Curve::AffineElement* points, const size_t num_points, typename Curve::BaseField* scratch_space) noexcept { - using Fq = typename Curve::BaseField; - Fq batch_inversion_accumulator = Fq::one(); - - // Forward pass: prepare batch inversion inputs. 
- // We reuse points[i+1] storage: .x stores (x2-x1), .y stores (y2-y1)*accumulator - for (size_t i = 0; i < num_points; i += 2) { - scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 (needed later for x3) - points[i + 1].x -= points[i].x; // x2 - x1 (denominator for lambda) - points[i + 1].y -= points[i].y; // y2 - y1 (numerator for lambda) - points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (points[i + 1].x); // accumulate denominators - } - if (batch_inversion_accumulator == 0) { - // prefer abort to throw for code that might emit from multiple threads - throw_or_abort("attempted to invert zero in add_affine_points"); - } else { - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - } - - // Backward pass: compute additions using batch inversion results. - // Reusing points[i+1] storage: .y becomes lambda, .x becomes lambda^2. - // Results are written to the top half of points array: points[(i+num_points)/2]. - // Loop terminates when i underflows (becomes > num_points for unsigned). - for (size_t i = (num_points)-2; i < num_points; i -= 2) { - points[i + 1].y *= batch_inversion_accumulator; // .y now holds lambda = (y2-y1)/(x2-x1) - batch_inversion_accumulator *= points[i + 1].x; // restore accumulator for next iteration - points[i + 1].x = points[i + 1].y.sqr(); // .x now holds lambda^2 - points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda^2 - x2 - x1 - // Output addresses jump non-sequentially: points[(i+n)>>1] defeats hardware prefetcher. - // Fetching 2 iterations ahead ensures data arrives before needed. 
- if (i >= 2) { - __builtin_prefetch(points + i - 2); - __builtin_prefetch(points + i - 1); - __builtin_prefetch(points + ((i + num_points - 2) >> 1)); - __builtin_prefetch(scratch_space + ((i - 2) >> 1)); - } - // Compute y3 = lambda * (x1 - x3) - y1, reusing points[i].x as temp storage - points[i].x -= points[(i + num_points) >> 1].x; // x1 - x3 - points[i].x *= points[i + 1].y; // lambda * (x1 - x3) - points[(i + num_points) >> 1].y = points[i].x - points[i].y; // y3 = lambda*(x1-x3) - y1 - } + using AffineElement = typename Curve::AffineElement; + using BaseField = typename Curve::BaseField; + + // Use interleaved array policy: pairs are (points[2i], points[2i+1]), output in points[num_pairs + i] + // This includes prefetching for non-sequential output access + const size_t num_pairs = num_points / 2; + bb::group_elements::batch_affine_add_impl( points, points, num_pairs, scratch_space); } template