Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7811d58
Add commitment timing to bench
ledwards2225 Jul 10, 2024
958a682
constructing an srs view and computing commitments on reduced inputs
ledwards2225 Jul 11, 2024
c62842b
add commit bench from my other branch
ledwards2225 Jul 15, 2024
6e940fd
clean up commit bench a bit
ledwards2225 Jul 15, 2024
170ac63
commit_sparse method with test and benchmark
ledwards2225 Jul 15, 2024
c8055e0
multithreaded commit sparse
ledwards2225 Jul 15, 2024
2cdd476
use comit sparse in oink
ledwards2225 Jul 15, 2024
e8c3002
Merge branch 'master' into lde/analyze_commitments
ledwards2225 Jul 15, 2024
5b3926a
cleanup
ledwards2225 Jul 16, 2024
df99d1d
clean out comit key test suite
ledwards2225 Jul 16, 2024
0696f08
fix gcc
ledwards2225 Jul 16, 2024
75fbea8
improve commit sparse and add comments
ledwards2225 Jul 16, 2024
a339ff5
clean up commit bench
ledwards2225 Jul 16, 2024
9fecf59
Merge branch 'master' into lde/analyze_commitments
ledwards2225 Jul 16, 2024
d89771f
clean and comment
ledwards2225 Jul 16, 2024
ff801fe
fix endo point from debugging
ledwards2225 Jul 16, 2024
6139b4a
update test
ledwards2225 Jul 16, 2024
cd3ed2f
Merge branch 'master' into lde/analyze_commitments
ledwards2225 Jul 16, 2024
e503e55
cleanup and naming
ledwards2225 Jul 16, 2024
1789be2
Merge branch 'master' into lde/analyze_commitments
ledwards2225 Jul 17, 2024
982480d
update commit bench with larger polys
ledwards2225 Jul 18, 2024
199faca
Merge branch 'master' into lde/analyze_commitments
ledwards2225 Jul 22, 2024
3ee1bab
reserve space in vectors
ledwards2225 Jul 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 45 additions & 23 deletions barretenberg/cpp/scripts/analyze_client_ivc_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
PREFIX = Path("build-op-count-time")
IVC_BENCH_JSON = Path("client_ivc_bench.json")
BENCHMARK = "ClientIVCBench/Full/6"
# BENCHMARK = "ClientIVCBench/FullStructured/6"

# Single out an independent set of functions accounting for most of BENCHMARK's real_time
to_keep = [
Expand Down Expand Up @@ -71,10 +72,35 @@
print(f"{key:<{max_label_length}}{time_ms:>8.0f} {time_ms/total_time_ms:>8.2%}")


# Relations breakdown
# Note: The timings here are off, likely because the tracking is occurring in a hot loop, but
# they should be meaningful relative to one another
print('\nRelation contributions (times to be interpreted relatively):')
# Extract a set of components from the benchmark data and display timings and relative percentages
def print_contributions(prefix, ivc_bench_json, bench_name, components):

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was logic previously used only to process bench data for the Relations but I wanted to reuse it for commitments so I just made it a method


# Read JSON file and extract benchmark
try:
with open(prefix / ivc_bench_json, "r") as read_file:
read_result = json.load(read_file)
bench = next((_bench for _bench in read_result["benchmarks"] if _bench["name"] == bench_name), None)
if not bench:
raise ValueError(f"Benchmark '{bench_name}' not found in the JSON file.")
except FileNotFoundError:
print(f"File not found: {prefix / ivc_bench_json}")
return

# Filter and sum up kept times
bench_components = {key: bench[key] for key in components if key in bench}
sum_of_kept_times_ms = sum(float(time) for time in bench_components.values()) / 1e6
print(f"Total time accounted for (ms): {sum_of_kept_times_ms:>8.0f}")

# Print results
max_label_length = max(len(label) for label in components)
column_headers = {"operation": "operation", "ms": "ms", "%": "% sum"}
print(f"{column_headers['operation']:<{max_label_length}}{column_headers['ms']:>8} {column_headers['%']:>8}")

for key in components:
time_ms = bench_components.get(key, 0) / 1e6
percentage = time_ms / sum_of_kept_times_ms if sum_of_kept_times_ms > 0 else 0
print(f"{key:<{max_label_length}}{time_ms:>8.0f} {percentage:>8.2%}")

relations = [
"Arithmetic::accumulate(t)",
"Permutation::accumulate(t)",
Expand All @@ -87,23 +113,19 @@
"PoseidonExt::accumulate(t)",
"PoseidonInt::accumulate(t)",
]
with open(PREFIX/IVC_BENCH_JSON, "r") as read_file:
read_result = json.load(read_file)
for _bench in read_result["benchmarks"]:
if _bench["name"] == BENCHMARK:
bench = _bench
bench_components = dict(filter(lambda x: x[0] in relations, bench.items()))

# For each kept time, get the proportion over all kept times.
sum_of_kept_times_ms = sum(float(time)
for _, time in bench_components.items())/1e6
max_label_length = max(len(label) for label in relations)
column = {"function": "function", "ms": "ms", "%": "% sum"}
print(
f"{column['function']:<{max_label_length}}{column['ms']:>8} {column['%']:>8}")
for key in relations:
if key not in bench:
time_ms = 0
else:
time_ms = bench[key]/1e6
print(f"{key:<{max_label_length}}{time_ms:>8.0f} {time_ms/sum_of_kept_times_ms:>8.2%}")
print('\nRelation contributions (times to be interpreted relatively):')
print_contributions(PREFIX, IVC_BENCH_JSON, BENCHMARK, relations)

# Commitment timers to extract from the benchmark record, listed in display order
commitments = [
    "COMMIT::wires(t)",
    "COMMIT::z_perm(t)",
    "COMMIT::databus(t)",
    "COMMIT::ecc_op_wires(t)",
    "COMMIT::lookup_inverses(t)",
    "COMMIT::databus_inverses(t)",
    "COMMIT::lookup_counts_tags(t)",
]

# Show each commitment timer together with its share of the summed commitment time
print("\nCommitment contributions:")
print_contributions(
    prefix=PREFIX,
    ivc_bench_json=IVC_BENCH_JSON,
    bench_name=BENCHMARK,
    components=commitments,
)
1 change: 1 addition & 0 deletions barretenberg/cpp/scripts/benchmark_client_ivc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ set -eu
TARGET="client_ivc_bench"
# Note: to run structured trace version, change "Full" to "FullStructured" here and in analyze script
FILTER="ClientIVCBench/Full/6$"
# FILTER="ClientIVCBench/FullStructured/6$"
BUILD_DIR=build-op-count-time

# Move above script dir.
Expand Down
140 changes: 134 additions & 6 deletions barretenberg/cpp/src/barretenberg/commitment_schemes/commit.bench.cpp
Original file line number Diff line number Diff line change
@@ -1,31 +1,159 @@

#include "barretenberg/commitment_schemes/commitment_key.hpp"
#include "barretenberg/common/zip_view.hpp"
#include "barretenberg/srs/factories/mem_bn254_crs_factory.hpp"
#include <algorithm>
#include <benchmark/benchmark.h>
#include <iostream>
#include <ranges>
#include <vector>

namespace bb {

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This suite was previously just benchmarking committing to zero polynomials of various sizes. I'm assuming it was just a WiP and never used but who knows. I've updated it to include a number of different scenarios, including committing various types of sparse polynomials with the traditional commit() and the new commit_sparse()

/**
 * @brief Construct the commitment key used by the benchmarks in this file.
 * @details Calls bb::srs::init_crs_factory with the "../srs_db/ignition" path and then builds a
 * CommitmentKey over Curve from the requested number of points.
 *
 * @tparam Curve Curve the commitment key is instantiated over.
 * @param num_points Value forwarded to the CommitmentKey constructor.
 * @return Shared pointer owning the new commitment key.
 */
template <typename Curve> std::shared_ptr<CommitmentKey<Curve>> create_commitment_key(const size_t num_points)
{
    bb::srs::init_crs_factory("../srs_db/ignition");
    return std::make_shared<CommitmentKey<Curve>>(num_points);
}

constexpr size_t MAX_LOG_NUM_POINTS = 24;
constexpr size_t MAX_NUM_POINTS = 1 << MAX_LOG_NUM_POINTS;
/**
 * @brief Construct a polynomial of the given size whose first num_nonzero coefficients are random.
 * @details Coefficients at indices [0, num_nonzero) are overwritten with FF::random_element(); every other
 * coefficient keeps the value given to it by the Polynomial<FF>(size) constructor. The function only builds
 * the polynomial; it does not touch the CRS or construct a commitment key.
 *
 * @tparam FF Field type of the coefficients.
 * @param size Number of coefficients in the polynomial.
 * @param num_nonzero Number of leading coefficients to randomize; indexed directly, so expected to be <= size.
 * @return The constructed polynomial.
 */
template <typename FF> Polynomial<FF> sparse_random_poly(const size_t size, const size_t num_nonzero)
{
    auto polynomial = Polynomial<FF>(size);

    for (size_t i = 0; i < num_nonzero; i++) {
        polynomial[i] = FF::random_element();
    }

    return polynomial;
}

template <typename Curve> void bench_commit(::benchmark::State& state)
/**
 * @brief Copy every second entry of the commitment key's monomial point table into a vector.
 * @details Views the first 2 * num_points entries returned by srs->get_monomial_points() and keeps the
 * entries at even indices (0, 2, 4, ...), yielding exactly num_points points.
 * NOTE(review): the odd entries are presumably auxiliary (endomorphism) points interleaved with the SRS
 * points - confirm against the SRS table layout.
 *
 * @tparam Curve Curve the commitment key is defined over.
 * @param commitment_key Key whose SRS point table is read.
 * @param num_points Number of points to extract.
 * @return Vector holding the num_points even-indexed table entries, in table order.
 */
template <typename Curve>
std::vector<typename Curve::AffineElement> extract_srs(std::shared_ptr<CommitmentKey<Curve>> commitment_key,
                                                       const size_t num_points)
{
    using G1 = typename Curve::AffineElement;

    std::span<G1> point_table(commitment_key->srs->get_monomial_points(), 2 * num_points);

    std::vector<G1> monomials;
    // The output size is known up front, so allocate once instead of growing the vector repeatedly
    monomials.reserve(num_points);

    // Stride of two selects the even-indexed entries of the table
    for (size_t idx = 0; idx < point_table.size(); idx += 2) {
        monomials.emplace_back(point_table[idx]);
    }
    return monomials;
}

constexpr size_t MAX_LOG_NUM_POINTS = 18;
constexpr size_t MAX_NUM_POINTS = 1 << MAX_LOG_NUM_POINTS;
Comment on lines +30 to +31

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW we prob do care about 2^24 since that's the ClientIVC recursive verifier size right now.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah easy enough to change for an interested party but I was running these a lot and anything larger was taking too long


/**
 * @brief Benchmark commit() on a freshly constructed (presumably all-zero) polynomial of size 2^range(0).
 *
 * @tparam Curve Curve the commitment key is defined over.
 * @param state Google Benchmark state; range(0) supplies the log2 of the polynomial size.
 */
template <typename Curve> void bench_commit_zero(::benchmark::State& state)
{
    auto key = create_commitment_key<Curve>(MAX_NUM_POINTS);

    const size_t num_points = 1 << state.range(0);
    const auto polynomial = Polynomial<typename Curve::ScalarField>(num_points);
    for (auto _ : state) {
        // Commit exactly once per iteration so the reported time is that of a single commitment;
        // DoNotOptimize keeps the compiler from discarding the otherwise unused result.
        benchmark::DoNotOptimize(key->commit(polynomial));
    }
}

/**
 * @brief Benchmark commit() on a polynomial of size 2^range(0) whose first two coefficients are set to 1.
 * @details All remaining coefficients keep the value given by the Polynomial<Fr>(num_points) constructor.
 *
 * @tparam Curve Curve the commitment key is defined over.
 * @param state Google Benchmark state; range(0) supplies the log2 of the polynomial size.
 */
template <typename Curve> void bench_commit_sparse(::benchmark::State& state)
{
    using Fr = typename Curve::ScalarField;
    auto key = create_commitment_key<Curve>(MAX_NUM_POINTS);

    const size_t num_points = 1 << state.range(0);
    const size_t num_nonzero = 2;

    auto polynomial = Polynomial<Fr>(num_points);
    for (size_t i = 0; i < num_nonzero; i++) {
        polynomial[i] = 1;
    }

    for (auto _ : state) {
        // Route the result through DoNotOptimize so the compiler cannot discard the commitment computation
        benchmark::DoNotOptimize(key->commit(polynomial));
    }
}

template <typename Curve> void bench_commit_sparse_preprocessed(::benchmark::State& state)
{
using Fr = typename Curve::ScalarField;
auto key = create_commitment_key<Curve>(MAX_NUM_POINTS);

const size_t num_points = 1 << state.range(0);
const size_t num_nonzero = 2;

auto polynomial = Polynomial<Fr>(num_points);
for (size_t i = 0; i < num_nonzero; i++) {
polynomial[i] = 1;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe a follow-on question: if these are at random locations, do we have the same performance?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the random sparse polys (i.e. the ones for which I was showing bench results) the locations are randomized, but I did not see a difference before and after randomizing the locations.

}

for (auto _ : state) {
key->commit_sparse(polynomial);
}
}

/**
 * @brief Benchmark commit() on a polynomial of size 2^range(0) with five random coefficients.
 * @details The polynomial is produced by sparse_random_poly with num_nonzero = 5.
 *
 * @tparam Curve Curve the commitment key is defined over.
 * @param state Google Benchmark state; range(0) supplies the log2 of the polynomial size.
 */
template <typename Curve> void bench_commit_sparse_random(::benchmark::State& state)
{
    using Fr = typename Curve::ScalarField;
    auto key = create_commitment_key<Curve>(MAX_NUM_POINTS);

    const size_t num_points = 1 << state.range(0);
    const size_t num_nonzero = 5;

    auto polynomial = sparse_random_poly<Fr>(num_points, num_nonzero);

    for (auto _ : state) {
        // Route the result through DoNotOptimize so the compiler cannot discard the commitment computation
        benchmark::DoNotOptimize(key->commit(polynomial));
    }
}

/**
 * @brief Benchmark commit_sparse() on a polynomial of size 2^range(0) with five random coefficients.
 * @details Same input as bench_commit_sparse_random (sparse_random_poly with num_nonzero = 5), but committed
 * with commit_sparse() instead of commit() so the two methods can be compared.
 *
 * @tparam Curve Curve the commitment key is defined over.
 * @param state Google Benchmark state; range(0) supplies the log2 of the polynomial size.
 */
template <typename Curve> void bench_commit_sparse_random_preprocessed(::benchmark::State& state)
{
    using Fr = typename Curve::ScalarField;
    auto key = create_commitment_key<Curve>(MAX_NUM_POINTS);

    const size_t num_points = 1 << state.range(0);
    const size_t num_nonzero = 5;

    auto polynomial = sparse_random_poly<Fr>(num_points, num_nonzero);

    for (auto _ : state) {
        // Route the result through DoNotOptimize so the compiler cannot discard the commitment computation
        benchmark::DoNotOptimize(key->commit_sparse(polynomial));
    }
}

/**
 * @brief Benchmark commit() on a polynomial of size 2^range(0) in which every coefficient is random.
 *
 * @tparam Curve Curve the commitment key is defined over.
 * @param state Google Benchmark state; range(0) supplies the log2 of the polynomial size.
 */
template <typename Curve> void bench_commit_random(::benchmark::State& state)
{
    using Fr = typename Curve::ScalarField;
    auto key = create_commitment_key<Curve>(MAX_NUM_POINTS);

    const size_t num_points = 1 << state.range(0);
    auto polynomial = Polynomial<Fr>(num_points);
    for (auto& coeff : polynomial) {
        coeff = Fr::random_element();
    }
    for (auto _ : state) {
        // Route the result through DoNotOptimize so the compiler cannot discard the commitment computation
        benchmark::DoNotOptimize(key->commit(polynomial));
    }
}

// Register the BN254 commitment benchmarks for log sizes 14..MAX_LOG_NUM_POINTS, reported in milliseconds.
// Only benchmark functions that are defined in this file are registered.
BENCHMARK(bench_commit_zero<curve::BN254>)->DenseRange(14, MAX_LOG_NUM_POINTS)->Unit(benchmark::kMillisecond);
BENCHMARK(bench_commit_sparse<curve::BN254>)->DenseRange(14, MAX_LOG_NUM_POINTS)->Unit(benchmark::kMillisecond);
BENCHMARK(bench_commit_sparse_preprocessed<curve::BN254>)
    ->DenseRange(14, MAX_LOG_NUM_POINTS)
    ->Unit(benchmark::kMillisecond);
BENCHMARK(bench_commit_sparse_random<curve::BN254>)->DenseRange(14, MAX_LOG_NUM_POINTS)->Unit(benchmark::kMillisecond);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NB the weird asymptotics including a sudden huge jump.

// Register the commit_sparse() benchmark on random sparse polynomials and the commit() benchmark on fully
// random polynomials, both on BN254 for log sizes 14..MAX_LOG_NUM_POINTS with times reported in milliseconds.
BENCHMARK(bench_commit_sparse_random_preprocessed<curve::BN254>)
->DenseRange(14, MAX_LOG_NUM_POINTS)
->Unit(benchmark::kMillisecond);
BENCHMARK(bench_commit_random<curve::BN254>)->DenseRange(14, MAX_LOG_NUM_POINTS)->Unit(benchmark::kMillisecond);

} // namespace bb

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cstddef>
#include <memory>
#include <ranges>
#include <string_view>

namespace bb {
Expand All @@ -34,6 +35,7 @@ template <class Curve> class CommitmentKey {

using Fr = typename Curve::ScalarField;
using Commitment = typename Curve::AffineElement;
using G1 = typename Curve::AffineElement;

public:
scalar_multiplication::pippenger_runtime_state<Curve> pippenger_runtime_state;
Expand Down Expand Up @@ -81,6 +83,47 @@ template <class Curve> class CommitmentKey {
return scalar_multiplication::pippenger_unsafe<Curve>(
const_cast<Fr*>(polynomial.data()), srs->get_monomial_points(), degree, pippenger_runtime_state);
};

Commitment commit_sparse(std::span<const Fr> polynomial)
{
// BB_OP_COUNT_TIME();
const size_t degree = polynomial.size();
ASSERT(degree <= srs->get_monomial_size());

G1* point_table = srs->get_monomial_points();

std::vector<Fr> scalars;
std::vector<G1> points;

const size_t num_threads = degree >= get_num_cpus_pow2() ? get_num_cpus_pow2() : 1;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is weird for small degree but we don't care in practice?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC this isn't (and shouldn't be) the number of threads, just the number of things you need to iterate over. If you look closely at the current implementation of parallel_for, the number of threads is not something the caller can choose; you can only choose the number of elements to iterate over (which is right):

void parallel_for_mutex_pool(size_t num_iterations, const std::function<void(size_t)>& func)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess I'm a little unclear what your suggestion is here. Is your objection about the name num_threads? I guess I'm thinking of it as shorthand for "num threads over which to distribute the work", which is how this value is used if I'm not mistaken. Admittedly though I am being a bit sloppy here in that I don't really need a power-of-2 thread count and my condition for when not to multithread is a bit arbitrary. In practice though the only context that matters is degree ~2^18 and higher so not so important to get the small values right

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I mean is that the value passed to parallel_for is in principle not the number of threads. If you have 16 cores and do parallel_for(200, some_f) then it will be chunked in 200/16 chunks over 16 threads, and f will be called from 0 to 199.

Maybe the way you set it up does make it coincide with the number of threads though. @ludamad would be the best to ask. I'm commenting because I had to use parallel_for recently and noticed this subtlety.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah I see what you mean. In my case the "num_iterations" input will always be <= actual num threads but your point stands. I don't really love that - seems like the parallel_for interface should allow you to specify how it should multithread.

const size_t block_size = degree / num_threads;

std::vector<std::vector<Fr>> thread_scalars(num_threads);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My above comment also means that you might get fewer real threads than the number of elements. IIUC, this means that your division of vectors here is still thread-safe, but less efficient than you might think (i.e., not one "bucket" per thread).

std::vector<std::vector<G1>> thread_points(num_threads);

// Scan the coefficients in num_threads contiguous blocks, keeping each {point, scalar} pair whose scalar is
// non-zero. Each block writes only to its own bucket, so the blocks do not share mutable state.
parallel_for(num_threads, [&](size_t thread_idx) {
    const size_t start = thread_idx * block_size;
    // block_size is degree / num_threads rounded down, so the final block must run up to degree; otherwise
    // the trailing degree % num_threads coefficients would be silently left out of the commitment.
    const size_t end = (thread_idx == num_threads - 1) ? degree : (thread_idx + 1) * block_size;

    for (size_t idx = start; idx < end; ++idx) {
        // The point paired with coefficient idx sits at table index 2 * idx
        const G1& point = point_table[idx * 2];
        const Fr& scalar = polynomial[idx];
        if (!scalar.is_zero()) {
            thread_scalars[thread_idx].emplace_back(scalar);
            thread_points[thread_idx].emplace_back(point);
        }
    }
});

for (size_t idx = 0; idx < num_threads; ++idx) {
scalars.insert(scalars.end(), thread_scalars[idx].begin(), thread_scalars[idx].end());
points.insert(points.end(), thread_points[idx].begin(), thread_points[idx].end());
}

return scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
scalars.data(), points.data(), scalars.size(), pippenger_runtime_state);
}
};

} // namespace bb
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#include "barretenberg/commitment_schemes/commitment_key.hpp"
#include "barretenberg/polynomials/polynomial.hpp"
#include "barretenberg/srs/factories/file_crs_factory.hpp"

#include <gtest/gtest.h>

namespace bb {

// Typed test fixture for CommitmentKey, parameterized by the curve under test.
template <typename Curve> class CommitmentKeyTest : public ::testing::Test {
// Convenience aliases for the curve-dependent types
using CK = CommitmentKey<Curve>;

using Fr = typename Curve::ScalarField;
using Commitment = typename Curve::AffineElement;
using Polynomial = bb::Polynomial<Fr>;

public:
// Returns a shared commitment key of type CK built from num_points. Only declared here; the definitions
// are the explicit per-curve specializations that follow the class.
template <class CK> inline std::shared_ptr<CK> create_commitment_key(size_t num_points);
};

// BN254 specialization of create_commitment_key: calls srs::init_crs_factory with the "../srs_db/ignition"
// path, then returns a shared CommitmentKey<curve::BN254> constructed from num_points.
template <>
template <>
std::shared_ptr<CommitmentKey<curve::BN254>> CommitmentKeyTest<curve::BN254>::create_commitment_key<
CommitmentKey<curve::BN254>>(const size_t num_points)
{
srs::init_crs_factory("../srs_db/ignition");
return std::make_shared<CommitmentKey<curve::BN254>>(num_points);
}

// Grumpkin specialization of create_commitment_key: calls srs::init_grumpkin_crs_factory with the
// "../srs_db/grumpkin" path, then returns a shared CommitmentKey<curve::Grumpkin> constructed from num_points.
template <>
template <>
std::shared_ptr<CommitmentKey<curve::Grumpkin>> CommitmentKeyTest<curve::Grumpkin>::create_commitment_key<
CommitmentKey<curve::Grumpkin>>(const size_t num_points)
{
srs::init_grumpkin_crs_factory("../srs_db/grumpkin");
return std::make_shared<CommitmentKey<curve::Grumpkin>>(num_points);
}

using Curves = ::testing::Types<curve::BN254, curve::Grumpkin>;

TYPED_TEST_SUITE(CommitmentKeyTest, Curves);

// commit_sparse must produce the same commitment as commit on a polynomial with few non-zero coefficients
TYPED_TEST(CommitmentKeyTest, CommitSparse)
{
    using Curve = TypeParam;
    using CK = CommitmentKey<Curve>;
    using G1 = typename Curve::AffineElement;
    using Fr = typename Curve::ScalarField;
    using Polynomial = bb::Polynomial<Fr>;

    const size_t num_points = 1 << 10;
    const size_t num_nonzero = 7;

    // Place random values at the squares 1, 4, 9, ... (reduced mod num_points); everything else is untouched
    Polynomial sparse_poly{ num_points };
    for (size_t k = 1; k <= num_nonzero; ++k) {
        const size_t position = (k * k) % num_points;
        sparse_poly[position] = Fr::random_element();
    }

    // Compute the reference commitment and the sparse commitment with one and the same key
    auto commitment_key = TestFixture::template create_commitment_key<CK>(num_points);
    G1 commit_result = commitment_key->commit(sparse_poly);
    G1 sparse_commit_result = commitment_key->commit_sparse(sparse_poly);

    EXPECT_EQ(sparse_commit_result, commit_result);
}

} // namespace bb
Loading