From 8903c061bb402ae50fdf8b8624d457cf78cc98db Mon Sep 17 00:00:00 2001 From: maramihali Date: Fri, 29 Nov 2024 12:05:26 +0000 Subject: [PATCH 1/2] parallelise coefficient tree level structure allocation --- .../cpp/scripts/analyze_client_ivc_bench.py | 8 +- .../client_ivc_bench/client_ivc.bench.cpp | 2 +- .../protogalaxy_prover_internal.hpp | 94 ++++++++++++++----- 3 files changed, 75 insertions(+), 29 deletions(-) diff --git a/barretenberg/cpp/scripts/analyze_client_ivc_bench.py b/barretenberg/cpp/scripts/analyze_client_ivc_bench.py index 600854679c5e..28213fc16d98 100755 --- a/barretenberg/cpp/scripts/analyze_client_ivc_bench.py +++ b/barretenberg/cpp/scripts/analyze_client_ivc_bench.py @@ -16,13 +16,17 @@ # Single out an independent set of functions accounting for most of BENCHMARK's real_time to_keep = [ "construct_circuits(t)", - "DeciderProvingKey(Circuit&)(t)", + # "DeciderProvingKey(Circuit&)(t)", "ProtogalaxyProver::prove(t)", + "initialise coefficient tree level(t)", + "ProtogalaxyProver_::compute_row_evaluations(t)", + "ProtogalaxyProver_::construct_perturbator_coefficients(t)", + "ProtogalaxyProver_::construct_coefficients_tree(t)", "Decider::construct_proof(t)", "ECCVMProver(CircuitBuilder&)(t)", "ECCVMProver::construct_proof(t)", "TranslatorProver::construct_proof(t)", - "Goblin::merge(t)" + # "Goblin::merge(t)" ] with open(PREFIX / IVC_BENCH_JSON, "r") as read_file: diff --git a/barretenberg/cpp/src/barretenberg/benchmark/client_ivc_bench/client_ivc.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/client_ivc_bench/client_ivc.bench.cpp index 6033648972c4..7aa1b8910d83 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/client_ivc_bench/client_ivc.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/client_ivc_bench/client_ivc.bench.cpp @@ -33,7 +33,7 @@ class ClientIVCBench : public benchmark::Fixture { */ BENCHMARK_DEFINE_F(ClientIVCBench, Full)(benchmark::State& state) { - ClientIVC ivc{ { CLIENT_IVC_BENCH_STRUCTURE } }; + 
ClientIVC ivc{ { EXAMPLE_20 } }; auto total_num_circuits = 2 * static_cast(state.range(0)); // 2x accounts for kernel circuits auto mocked_vkeys = mock_verification_keys(total_num_circuits); diff --git a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp index efad2d0e0726..7041ea8cd382 100644 --- a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp +++ b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp @@ -161,6 +161,25 @@ template class ProtogalaxyProverInternal { return aggregated_relation_evaluations; } + + static std::vector> initialise_coefficient_tree_level(const size_t level_width, const size_t degree) + { + PROFILE_THIS_NAME("initialise coefficient tree level"); + std::vector> level_coeffs(level_width); + size_t num_threads = calculate_num_threads(level_width); + size_t range_per_thread = level_width / num_threads; + size_t leftovers = level_width - (range_per_thread * num_threads); + parallel_for(num_threads, [&](size_t j) { + size_t offset = j * range_per_thread; + size_t range = (j == num_threads - 1) ? range_per_thread + leftovers : range_per_thread; + ASSERT(offset < level_width || level_width == 0); + ASSERT((offset + range) <= level_width); + for (size_t idx = offset; idx < offset + range; idx++) { + level_coeffs[idx].resize(degree + 1); + } + }); + return level_coeffs; + } /** * @brief Recursively compute the parent nodes of each level in the tree, starting from the leaves. 
Note that at * each level, the resulting parent nodes will be polynomials of degree (level+1) because we multiply by an @@ -171,24 +190,28 @@ template class ProtogalaxyProverInternal { const std::vector>& prev_level_coeffs, size_t level = 1) { + if (level == betas.size()) { return prev_level_coeffs[0]; } - - auto degree = level + 1; - auto prev_level_width = prev_level_coeffs.size(); - std::vector> level_coeffs(prev_level_width / 2, std::vector(degree + 1, 0)); - parallel_for_heuristic( - prev_level_width / 2, - [&](size_t parent) { - size_t node = parent * 2; - std::copy(prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin()); - for (size_t d = 0; d < degree; d++) { - level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level]; - level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level]; - } - }, - /* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * degree * 3); + const size_t degree = level + 1; + const size_t level_width = prev_level_coeffs.size() / 2; + std::vector> level_coeffs = initialise_coefficient_tree_level(level_width, degree); + { + PROFILE_THIS_NAME("other coefficients tree computation"); + parallel_for_heuristic( + level_width, + [&](size_t parent) { + size_t node = parent * 2; + std::copy( + prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin()); + for (size_t d = 0; d < degree; d++) { + level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level]; + level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level]; + } + }, + /* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * degree * 3); + } return construct_coefficients_tree(betas, deltas, level_coeffs, level + 1); } @@ -206,17 +229,36 @@ template class ProtogalaxyProverInternal { std::span deltas, const std::vector& full_honk_evaluations) { - auto width = full_honk_evaluations.size(); - std::vector> first_level_coeffs(width / 2, 
std::vector(2, 0)); - parallel_for_heuristic( - width / 2, - [&](size_t parent) { - size_t node = parent * 2; - first_level_coeffs[parent][0] = - full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0]; - first_level_coeffs[parent][1] = full_honk_evaluations[node + 1] * deltas[0]; - }, - /* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * 3); + + const size_t width = full_honk_evaluations.size() / 2; + std::vector> first_level_coeffs = initialise_coefficient_tree_level(width, 1); + // { + // PROFILE_THIS_NAME("first level coefficients allocation"); + // size_t num_threads = calculate_num_threads(width); + // size_t range_per_thread = width / num_threads; + // size_t leftovers = width - (range_per_thread * num_threads); + // parallel_for(num_threads, [&](size_t j) { + // size_t offset = j * range_per_thread; + // size_t range = (j == num_threads - 1) ? range_per_thread + leftovers : range_per_thread; + // ASSERT(offset < width || width == 0); + // ASSERT((offset + range) <= width); + // for (size_t idx = offset; idx < offset + range; idx++) { + // first_level_coeffs[idx].resize(2); + // } + // }); + // } + { + PROFILE_THIS_NAME("perturbator coefficients first level computation"); + parallel_for_heuristic( + width, + [&](size_t parent) { + size_t node = parent * 2; + first_level_coeffs[parent][0] = + full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0]; + first_level_coeffs[parent][1] = full_honk_evaluations[node + 1] * deltas[0]; + }, + /* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * 3); + } return construct_coefficients_tree(betas, deltas, first_level_coeffs); } From 005e588b6efffb2c6452bd201d0b737ce2c50aba Mon Sep 17 00:00:00 2001 From: maramihali Date: Fri, 29 Nov 2024 12:24:50 +0000 Subject: [PATCH 2/2] cleanup --- .../cpp/scripts/analyze_client_ivc_bench.py | 8 +--- .../client_ivc_bench/client_ivc.bench.cpp | 2 +- .../protogalaxy_prover_internal.hpp | 38 +++++++++---------- 3 files changed, 20 
insertions(+), 28 deletions(-) diff --git a/barretenberg/cpp/scripts/analyze_client_ivc_bench.py b/barretenberg/cpp/scripts/analyze_client_ivc_bench.py index 28213fc16d98..600854679c5e 100755 --- a/barretenberg/cpp/scripts/analyze_client_ivc_bench.py +++ b/barretenberg/cpp/scripts/analyze_client_ivc_bench.py @@ -16,17 +16,13 @@ # Single out an independent set of functions accounting for most of BENCHMARK's real_time to_keep = [ "construct_circuits(t)", - # "DeciderProvingKey(Circuit&)(t)", + "DeciderProvingKey(Circuit&)(t)", "ProtogalaxyProver::prove(t)", - "initialise coefficient tree level(t)", - "ProtogalaxyProver_::compute_row_evaluations(t)", - "ProtogalaxyProver_::construct_perturbator_coefficients(t)", - "ProtogalaxyProver_::construct_coefficients_tree(t)", "Decider::construct_proof(t)", "ECCVMProver(CircuitBuilder&)(t)", "ECCVMProver::construct_proof(t)", "TranslatorProver::construct_proof(t)", - # "Goblin::merge(t)" + "Goblin::merge(t)" ] with open(PREFIX / IVC_BENCH_JSON, "r") as read_file: diff --git a/barretenberg/cpp/src/barretenberg/benchmark/client_ivc_bench/client_ivc.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/client_ivc_bench/client_ivc.bench.cpp index 7aa1b8910d83..6033648972c4 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/client_ivc_bench/client_ivc.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/client_ivc_bench/client_ivc.bench.cpp @@ -33,7 +33,7 @@ class ClientIVCBench : public benchmark::Fixture { */ BENCHMARK_DEFINE_F(ClientIVCBench, Full)(benchmark::State& state) { - ClientIVC ivc{ { EXAMPLE_20 } }; + ClientIVC ivc{ { CLIENT_IVC_BENCH_STRUCTURE } }; auto total_num_circuits = 2 * static_cast(state.range(0)); // 2x accounts for kernel circuits auto mocked_vkeys = mock_verification_keys(total_num_circuits); diff --git a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp index 7041ea8cd382..c635da8cfa25 
100644 --- a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp +++ b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp @@ -162,19 +162,30 @@ template class ProtogalaxyProverInternal { return aggregated_relation_evaluations; } + /** + * @brief Initialise the data structure storing a set of nodes at a given level, in parallel if the width is + * sufficiently big + * + * @param level_width determines the number of nodes for the given level + * @param degree determines the degree of the polynomial stored in each node, the number of elements will be + * degree+1 + * + * @return std::vector> + */ static std::vector> initialise_coefficient_tree_level(const size_t level_width, const size_t degree) { PROFILE_THIS_NAME("initialise coefficient tree level"); std::vector> level_coeffs(level_width); - size_t num_threads = calculate_num_threads(level_width); - size_t range_per_thread = level_width / num_threads; - size_t leftovers = level_width - (range_per_thread * num_threads); + const size_t num_threads = calculate_num_threads(level_width); + const size_t range_per_thread = level_width / num_threads; + const size_t leftovers = level_width - (range_per_thread * num_threads); parallel_for(num_threads, [&](size_t j) { - size_t offset = j * range_per_thread; - size_t range = (j == num_threads - 1) ? range_per_thread + leftovers : range_per_thread; + const size_t offset = j * range_per_thread; + const size_t range = (j == num_threads - 1) ? 
range_per_thread + leftovers : range_per_thread; ASSERT(offset < level_width || level_width == 0); ASSERT((offset + range) <= level_width); for (size_t idx = offset; idx < offset + range; idx++) { + // Representing a polynomial of a certain degree requires degree + 1 coefficients level_coeffs[idx].resize(degree + 1); } }); @@ -232,27 +243,12 @@ template class ProtogalaxyProverInternal { const size_t width = full_honk_evaluations.size() / 2; std::vector> first_level_coeffs = initialise_coefficient_tree_level(width, 1); - // { - // PROFILE_THIS_NAME("first level coefficients allocation"); - // size_t num_threads = calculate_num_threads(width); - // size_t range_per_thread = width / num_threads; - // size_t leftovers = width - (range_per_thread * num_threads); - // parallel_for(num_threads, [&](size_t j) { - // size_t offset = j * range_per_thread; - // size_t range = (j == num_threads - 1) ? range_per_thread + leftovers : range_per_thread; - // ASSERT(offset < width || width == 0); - // ASSERT((offset + range) <= width); - // for (size_t idx = offset; idx < offset + range; idx++) { - // first_level_coeffs[idx].resize(2); - // } - // }); - // } { PROFILE_THIS_NAME("perturbator coefficients first level computation"); parallel_for_heuristic( width, [&](size_t parent) { - size_t node = parent * 2; + const size_t node = parent * 2; first_level_coeffs[parent][0] = full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0]; first_level_coeffs[parent][1] = full_honk_evaluations[node + 1] * deltas[0];