diff --git a/barretenberg/cpp/CLAUDE.md b/barretenberg/cpp/CLAUDE.md
index dbe3aaa51ad1..e6d98e4776ca 100644
--- a/barretenberg/cpp/CLAUDE.md
+++ b/barretenberg/cpp/CLAUDE.md
@@ -2,6 +2,14 @@ succinct aztec-packages cheat sheet.
THE PROJECT ROOT IS AT TWO LEVELS ABOVE THIS FOLDER. Typically, the repository is at ~/aztec-packages. all advice is from the root.
+# Git workflow for barretenberg
+
+**IMPORTANT**: When comparing branches or looking at diffs for barretenberg work, use `merge-train/barretenberg` as the base branch, NOT `master`. The master branch is often outdated for barretenberg development.
+
+Examples:
+- `git diff merge-train/barretenberg...HEAD` (not `git diff master...HEAD`)
+- `git log merge-train/barretenberg..HEAD` (not `git log master..HEAD`)
+
Run ./bootstrap.sh at the top-level to be sure the repo fully builds.
Bootstrap scripts can be called with relative paths e.g. ../barretenberg/bootstrap.sh
diff --git a/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh b/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
index 630d928089d0..748bb978fef4 100755
--- a/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
+++ b/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
@@ -16,7 +16,7 @@ PRESET=${3:-clang20}
BUILD_DIR=${4:-build}
HARDWARE_CONCURRENCY=${HARDWARE_CONCURRENCY:-16}
-BASELINE_BRANCH="master"
+BASELINE_BRANCH="${BASELINE_BRANCH:-merge-train/barretenberg}"
BENCH_TOOLS_DIR="$BUILD_DIR/_deps/benchmark-src/tools"
if [ ! -z "$(git status --untracked-files=no --porcelain)" ]; then
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/README.md b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/README.md
new file mode 100644
index 000000000000..17e135237eeb
--- /dev/null
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/README.md
@@ -0,0 +1,183 @@
+# Pippenger Multi-Scalar Multiplication (MSM)
+
+## Overview
+
+The Pippenger algorithm computes multi-scalar multiplications:
+
+$$\text{MSM}(\vec{s}, \vec{P}) = \sum_{i=0}^{n-1} s_i \cdot P_i$$
+
+**Complexity**: Let $q = \lceil \log_2(\text{field modulus}) \rceil$ be the scalar bit-length, $|A|$ the cost of a group addition, and $|D|$ the cost of a doubling.
+
+- **Pippenger**: $O\left(\frac{q}{c} \cdot \left((n + 2^c) \cdot |A| + c \cdot |D|\right)\right)$
+- **Naive**: $O(n \cdot q \cdot |D| + n \cdot q \cdot |A| / 2)$
+
+With $c \approx \frac{1}{2} \log_2 n$, Pippenger achieves roughly $O(n \cdot q / \log n)$ vs $O(n \cdot q)$ for naive scalar multiplication.
+
+## Algorithm
+
+### Step 1: Scalar Decomposition
+
+**Implementation**: `get_scalar_slice(scalar, round_index, bits_per_slice)`
+
+Each scalar $s_i$ is decomposed into $r$ slices of $c$ bits each, processed **MSB-first**:
+
+$$s_i = \sum_{j=0}^{r-1} s_i^{(j)} \cdot 2^{c(r-1-j)}$$
+
+- $c$ = bits per slice (from `get_optimal_log_num_buckets`, which brute-force searches for minimum cost)
+- $r = \lceil \texttt{NUM\_BITS\_IN\_FIELD} / c \rceil$ = number of rounds
+- Round 0 extracts the most significant bits
+
+### Step 2: Bucket Accumulation
+
+For each round $j$, points are added into **buckets** based on their scalar slice. Bucket $k$ accumulates all points whose slice value equals $k$:
+
+$$B_k^{(j)} = \sum_{\{i : s_i^{(j)} = k\}} P_i$$
+
+**Two implementation paths:**
+
+- **Affine**: Sorts points by bucket and uses batched affine additions
+- **Jacobian**: Direct bucket accumulation in Jacobian coordinates
+
+### Step 3: Bucket Reduction
+
+**Implementation**: `accumulate_buckets(bucket_accumulators)`
+
+Computes weighted sum using a suffix sum (high to low):
+
+$$R^{(j)} = \sum_{k=1}^{2^c - 1} k \cdot B_k^{(j)} = \sum_{k=1}^{2^c - 1} \left( \sum_{m=k}^{2^c - 1} B_m^{(j)} \right)$$
+
+An offset generator is added and subtracted to avoid rare accumulator edge cases—a probabilistic mitigation that simplifies accumulation logic.
+
+### Step 4: Round Combination
+
+Combines all rounds using Horner's method (MSB-first):
+
+```cpp
+msm_accumulator = point_at_infinity
+for j = 0 to r-1:
+ repeat c doublings (or fewer for final round)
+ msm_accumulator += bucket_result[j]
+```
+
+## Algorithm Variants
+
+### Entry Points and Safety
+
+| Entry Point | Default | Safety |
+|-------------|---------|--------|
+| `msm()` | `handle_edge_cases=false` | ⚠️ **Unsafe** |
+| `pippenger()` | `handle_edge_cases=true` | ✓ Safe |
+| `pippenger_unsafe()` | `handle_edge_cases=false` | ⚠️ Unsafe |
+| `batch_multi_scalar_mul()` | `handle_edge_cases=true` | ✓ Safe |
+
+### Edge Cases
+
+Affine addition fails for **P = Q** (doubling), **P = −Q** (inverse), and **P = O** (identity). Jacobian coordinates handle these correctly at higher cost (~2-3× slower).
+
+⚠️ **Use `msm()` or `pippenger_unsafe()` only when points are guaranteed linearly independent** (e.g., SRS points). For user-controlled or potentially duplicate points, use `pippenger()`.
+
+### Affine Pippenger (`handle_edge_cases=false`)
+
+Uses affine coordinates with Montgomery's batch inversion trick: replaces $m$ inversions with **1 inversion + O(m) multiplications**, yielding ~2-3× speedup over Jacobian.
+
+### Jacobian Pippenger (`handle_edge_cases=true`)
+
+Uses Jacobian coordinates for bucket accumulators. Handles all edge cases correctly.
+
+## Tuning Constants
+
+| Constant | Value | Purpose |
+|----------|-------|---------|
+| `PIPPENGER_THRESHOLD` | 16 | Below this, use naive scalar multiplication |
+| `AFFINE_TRICK_THRESHOLD` | 128 | Below this, batch inversion overhead exceeds savings |
+| `MAX_SLICE_BITS` | 20 | Upper bound on bucket count exponent |
+| `BATCH_SIZE` | 2048 | Points per batch inversion (fits L2 cache) |
+| `RADIX_BITS` | 8 | Bits per radix sort pass |
+
+
+Cost model constants and derivations
+
+| Constant | Value | Derivation |
+|----------|-------|------------|
+| `BUCKET_ACCUMULATION_COST` | 5 | 2 Jacobian adds/bucket × 2.5× cost ratio |
+| `AFFINE_TRICK_SAVINGS_PER_OP` | 5 | ~10 muls saved − ~3 muls for product tree |
+| `JACOBIAN_Z_NOT_ONE_PENALTY` | 5 | Extra field ops when Z ≠ 1 |
+| `INVERSION_TABLE_COST` | 14 | 4-bit lookup table for modular exp |
+
+**BATCH_SIZE=2048**: Each `AffineElement` is 64 bytes. 2048 points = 128 KB, fitting in L2 cache.
+
+**RADIX_BITS=8**: 256 radix buckets × 4 bytes = 1 KB counting array, fits in L1 cache.
+
+
+
+## Implementation Notes
+
+### Zero Scalar Filtering
+
+`transform_scalar_and_get_nonzero_scalar_indices` filters out zero scalars before processing (since $0 \cdot P_i = \mathcal{O}$). Scalars are converted from Montgomery form in-place to avoid doubling memory usage.
+
+### Bucket Existence Tracking
+
+A `BitVector` bitmap tracks which buckets are populated, avoiding expensive full-array clears between rounds. Clearing the bitmap costs $O(2^c / 64)$ words vs $O(2^c)$ for the full bucket array.
+
+### Point Scheduling (Affine Variant Only)
+
+Entries are packed as `(point_index << 32) | bucket_index` into 64-bit values. Since bucket indices fit in $c$ bits (typically 8-16), they occupy only the lowest bits of the packed entry. An **in-place MSD radix sort** on the low $c$ bits groups points by bucket for efficient batch processing. The sort also detects entries with `bucket_index == 0` during the final radix pass, allowing zero-bucket entries to be skipped without a separate scan.
+
+### Batched Affine Addition
+
+`batch_accumulate_points_into_buckets` processes sorted points iteratively:
+- Same-bucket pairs → queue for batch addition
+- Different buckets → cache in bucket or queue with existing accumulator
+- Uses branchless conditional moves to minimize pipeline stalls
+- Prefetches future points to hide memory latency
+- Recirculates results to maximize batch efficiency before writing to buckets
+
+
+Batch accumulation case analysis
+
+| Condition | Action | Iterator Update |
+|-----------|--------|-----------------|
+| `bucket[i] == bucket[i+1]` | Queue both points for batch add | `point_it += 2` |
+| Different buckets, accumulator exists | Queue point + accumulator | `point_it += 1` |
+| Different buckets, no accumulator | Cache point into bucket | `point_it += 1` |
+
+After batch addition, results targeting the same bucket are paired again before writing to bucket accumulators, reducing random memory access by ~50%.
+
+
+
+## Parallelization
+
+Uses **per-thread buffers** (bucket accumulators, scratch space) to eliminate contention.
+
+For `batch_multi_scalar_mul()`, work is distributed via `MSMWorkUnit` structures that can split a single MSM across multiple threads. Each thread computes partial results on point subsets, combined in a final reduction.
+
+
+Per-call buffer sizes
+
+| Buffer | Size | Purpose |
+|--------|------|---------|
+| `BucketAccumulators` (affine) | $2^c × 64$ bytes | Affine bucket array + bitmap |
+| `JacobianBucketAccumulators` | $2^c × 96$ bytes | Jacobian bucket array + bitmap |
+| `AffineAdditionData` | ~400 KB | Scratch for batch inversion |
+| `point_schedule` | $n × 8$ bytes | Per-MSM point schedule |
+
+Buffers are allocated per-call for WASM compatibility. Memory scales with thread count during parallel execution.
+
+
+
+## File Structure
+
+```
+scalar_multiplication/
+├── scalar_multiplication.hpp # MSM class, data structures
+├── scalar_multiplication.cpp # Core algorithm
+├── process_buckets.hpp/cpp # Radix sort
+├── bitvector.hpp # Bit vector for bucket tracking
+└── README.md # This file
+```
+
+## References
+
+1. Pippenger, N. (1976). "On the evaluation of powers and related problems"
+2. Bernstein, D.J. et al. "Faster batch forgery identification" (batch inversion)
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp
index a6e767961b44..c527b7b7b1b4 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp
@@ -10,89 +10,97 @@
namespace bb::scalar_multiplication {
-// NOLINTNEXTLINE(misc-no-recursion) recursion is fine here, max recursion depth is 8 (64 bit int / 8 bits per call)
+// NOLINTNEXTLINE(misc-no-recursion) recursion is fine here, max depth is 4 (32-bit bucket index / 8 bits per call)
void radix_sort_count_zero_entries(uint64_t* keys,
const size_t num_entries,
const uint32_t shift,
size_t& num_zero_entries,
- const uint32_t total_bits,
- const uint64_t* start_pointer) noexcept
+ const uint32_t bucket_index_bits,
+ const uint64_t* top_level_keys) noexcept
{
- constexpr size_t num_bits = 8;
- constexpr size_t num_buckets = 1UL << num_bits;
- constexpr uint32_t mask = static_cast<uint32_t>(num_buckets) - 1U;
- std::array<uint32_t, num_buckets> bucket_counts{};
+ constexpr size_t NUM_RADIX_BUCKETS = 1UL << RADIX_BITS;
+ constexpr uint32_t RADIX_MASK = static_cast<uint32_t>(NUM_RADIX_BUCKETS) - 1U;
+ // Step 1: Count entries in each radix bucket
+ std::array<uint32_t, NUM_RADIX_BUCKETS> bucket_counts{};
for (size_t i = 0; i < num_entries; ++i) {
- bucket_counts[(keys[i] >> shift) & mask]++;
+ bucket_counts[(keys[i] >> shift) & RADIX_MASK]++;
}
- std::array<uint32_t, num_buckets + 1> offsets;
- std::array<uint32_t, num_buckets + 1> offsets_copy;
+ // Step 2: Convert counts to cumulative offsets (prefix sum)
+ std::array<uint32_t, NUM_RADIX_BUCKETS + 1> offsets;
+ std::array<uint32_t, NUM_RADIX_BUCKETS + 1> offsets_copy;
offsets[0] = 0;
-
- for (size_t i = 0; i < num_buckets - 1; ++i) {
+ for (size_t i = 0; i < NUM_RADIX_BUCKETS - 1; ++i) {
bucket_counts[i + 1] += bucket_counts[i];
}
- if ((shift == 0) && (keys == start_pointer)) {
+
+ // Count zero entries only at the final recursion level (shift == 0) and only for the full array
+ if ((shift == 0) && (keys == top_level_keys)) {
num_zero_entries = bucket_counts[0];
}
- for (size_t i = 1; i < num_buckets + 1; ++i) {
+
+ for (size_t i = 1; i < NUM_RADIX_BUCKETS + 1; ++i) {
offsets[i] = bucket_counts[i - 1];
}
- for (size_t i = 0; i < num_buckets + 1; ++i) {
+ for (size_t i = 0; i < NUM_RADIX_BUCKETS + 1; ++i) {
offsets_copy[i] = offsets[i];
}
- uint64_t* start = &keys[0];
- for (size_t i = 0; i < num_buckets; ++i) {
+ // Step 3: In-place permutation using cycle sort
+ // For each radix bucket, repeatedly swap elements to their correct positions until all elements
+ // in that bucket's range belong there. The offsets array tracks the next write position for each bucket.
+ uint64_t* start = &keys[0];
+ for (size_t i = 0; i < NUM_RADIX_BUCKETS; ++i) {
uint64_t* bucket_start = &keys[offsets[i]];
const uint64_t* bucket_end = &keys[offsets_copy[i + 1]];
while (bucket_start != bucket_end) {
for (uint64_t* it = bucket_start; it < bucket_end; ++it) {
- const size_t value = (*it >> shift) & mask;
+ const size_t value = (*it >> shift) & RADIX_MASK;
const uint64_t offset = offsets[value]++;
std::iter_swap(it, start + offset);
}
bucket_start = &keys[offsets[i]];
}
}
+
+ // Step 4: Recursively sort each bucket by the next less-significant byte
if (shift > 0) {
- for (size_t i = 0; i < num_buckets; ++i) {
- if (offsets_copy[i + 1] - offsets_copy[i] > 1) {
- radix_sort_count_zero_entries(&keys[offsets_copy[i]],
- offsets_copy[i + 1] - offsets_copy[i],
- shift - 8,
- num_zero_entries,
- total_bits,
- keys);
+ for (size_t i = 0; i < NUM_RADIX_BUCKETS; ++i) {
+ const size_t bucket_size = offsets_copy[i + 1] - offsets_copy[i];
+ if (bucket_size > 1) {
+ radix_sort_count_zero_entries(
+ &keys[offsets_copy[i]], bucket_size, shift - RADIX_BITS, num_zero_entries, bucket_index_bits, keys);
}
}
}
}
-size_t process_buckets_count_zero_entries(uint64_t* wnaf_entries,
- const size_t num_entries,
- const uint32_t num_bits) noexcept
+size_t sort_point_schedule_and_count_zero_buckets(uint64_t* point_schedule,
+ const size_t num_entries,
+ const uint32_t bucket_index_bits) noexcept
{
if (num_entries == 0) {
return 0;
}
- const uint32_t bits_per_round = 8;
- const uint32_t base = num_bits & 7;
- const uint32_t total_bits = (base == 0) ? num_bits : num_bits - base + 8;
- const uint32_t shift = total_bits - bits_per_round;
+
+ // Round bucket_index_bits up to next multiple of RADIX_BITS for proper MSD radix sort alignment.
+ // E.g., if bucket_index_bits=10, we need to start sorting from bit 16 (2 bytes) not bit 10.
+ const uint32_t remainder = bucket_index_bits % RADIX_BITS;
+ const uint32_t padded_bits = (remainder == 0) ? bucket_index_bits : bucket_index_bits - remainder + RADIX_BITS;
+ const uint32_t initial_shift = padded_bits - RADIX_BITS;
+
size_t num_zero_entries = 0;
- radix_sort_count_zero_entries(wnaf_entries, num_entries, shift, num_zero_entries, num_bits, wnaf_entries);
-
- // inside radix_sort_count_zero_entries, if the least significant *byte* of `wnaf_entries[0] == 0`,
- // then num_nonzero_entries = number of entries that share the same value as wnaf_entries[0].
- // If wnaf_entries[0] != 0, we must manually set num_zero_entries = 0
- if (num_entries > 0) {
- if ((wnaf_entries[0] & 0xffffffff) != 0) {
- num_zero_entries = 0;
- }
+ radix_sort_count_zero_entries(
+ point_schedule, num_entries, initial_shift, num_zero_entries, bucket_index_bits, point_schedule);
+
+ // The radix sort counts entries where the least significant BYTE is zero, but we need entries where
+ // the entire bucket_index (lower 32 bits) is zero. Verify the first entry after sorting.
+ if ((point_schedule[0] & BUCKET_INDEX_MASK) != 0) {
+ num_zero_entries = 0;
}
+
return num_zero_entries;
}
+
} // namespace bb::scalar_multiplication
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.hpp
index e964fcc7adf0..7c5a8c68b881 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.hpp
@@ -10,13 +10,47 @@
#include <cstdint>
namespace bb::scalar_multiplication {
+
+// Number of bits processed per radix sort pass
+constexpr uint32_t RADIX_BITS = 8;
+
+// Mask for extracting bucket index from point schedule entry (lower 32 bits)
+constexpr uint64_t BUCKET_INDEX_MASK = 0xffffffff;
+
+/**
+ * @brief Recursive MSD radix sort that also counts entries with zero bucket index
+ * @details Sorts point schedule entries by their bucket index (lower 32 bits) using most-significant-digit
+ * radix sort. Processes RADIX_BITS bits per recursive call, starting from the most significant byte.
+ * As a side effect, counts entries where bucket_index == 0 (which are skipped during MSM).
+ *
+ * @param keys Array of point schedule entries to sort in-place (format: point_index << 32 | bucket_index)
+ * @param num_entries Number of entries to sort
+ * @param shift Current bit position to sort on (starts high, decreases by RADIX_BITS each recursion)
+ * @param[out] num_zero_entries Count of entries with bucket_index == 0 (only set at final recursion level)
+ * @param bucket_index_bits Number of bits in the bucket index (used to determine recursion depth)
+ * @param top_level_keys Pointer to original array start; used to detect top-level call for zero counting
+ */
void radix_sort_count_zero_entries(uint64_t* keys,
- const size_t num_entries,
- const uint32_t shift,
+ size_t num_entries,
+ uint32_t shift,
size_t& num_zero_entries,
- const uint32_t total_bits,
- const uint64_t* start_pointer) noexcept;
-size_t process_buckets_count_zero_entries(uint64_t* wnaf_entries,
- const size_t num_entries,
- const uint32_t num_bits) noexcept;
+ uint32_t bucket_index_bits,
+ const uint64_t* top_level_keys) noexcept;
+
+/**
+ * @brief Sort point schedule by bucket index and count zero-bucket entries
+ * @details Entry point for radix sort. Sorts entries so that points targeting the same bucket
+ * are contiguous, improving cache locality during bucket accumulation. Also counts
+ * entries with bucket_index == 0, which represent scalar bits that don't contribute
+ * to any bucket in the current Pippenger round.
+ *
+ * @param point_schedule Array of packed entries (point_index << 32 | bucket_index) to sort in-place
+ * @param num_entries Number of entries
+ * @param bucket_index_bits Number of bits used for bucket indices (determines sort range)
+ * @return Number of entries with bucket_index == 0 (these are at the start after sorting)
+ */
+size_t sort_point_schedule_and_count_zero_buckets(uint64_t* point_schedule,
+ size_t num_entries,
+ uint32_t bucket_index_bits) noexcept;
+
} // namespace bb::scalar_multiplication
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp
index d4199c0e6cd4..7e73e8ddbe66 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp
@@ -21,21 +21,14 @@
namespace bb::scalar_multiplication {
-/**
- * @brief Fallback method for very small numbers of input points
- *
- * @tparam Curve
- * @param scalars
- * @param points
- * @param range
- * @return Curve::Element
- */
-template <typename Curve>
-typename Curve::Element small_mul(std::span<typename Curve::ScalarField>& scalars,
- std::span<const typename Curve::AffineElement>& points,
- std::span<uint32_t> scalar_indices,
- size_t range) noexcept
+// Naive double-and-add fallback for small inputs (< PIPPENGER_THRESHOLD points).
+template <typename Curve> typename Curve::Element small_mul(const typename MSM<Curve>::MSMData& msm_data) noexcept
{
+ const auto& scalars = msm_data.scalars;
+ const auto& points = msm_data.points;
+ const auto& scalar_indices = msm_data.scalar_indices;
+ const size_t range = scalar_indices.size();
+
typename Curve::Element r = Curve::Group::point_at_infinity;
for (size_t i = 0; i < range; ++i) {
typename Curve::Element f = points[scalar_indices[i]];
@@ -44,21 +37,14 @@ typename Curve::Element small_mul(std::span&
return r;
}
-/**
- * @brief Convert scalar out of Montgomery form. Populate `consolidated_indices` with nonzero scalar indices
- *
- * @tparam Curve
- * @param scalars
- * @param consolidated_indices
- */
template <typename Curve>
void MSM<Curve>::transform_scalar_and_get_nonzero_scalar_indices(std::span<typename Curve::ScalarField> scalars,
- std::vector<uint32_t>& consolidated_indices) noexcept
+ std::vector<uint32_t>& nonzero_scalar_indices) noexcept
{
std::vector<std::vector<uint32_t>> thread_indices(get_num_cpus());
+ // Pass 1: Each thread converts from Montgomery and collects nonzero indices into its own vector
parallel_for([&](const ThreadChunk& chunk) {
- // parallel_for with ThreadChunk uses get_num_cpus() threads
BB_ASSERT_EQ(chunk.total_threads, thread_indices.size());
auto range = chunk.range(scalars.size());
if (range.empty()) {
@@ -71,9 +57,7 @@ void MSM::transform_scalar_and_get_nonzero_scalar_indices(std::span(i));
}
}
@@ -83,32 +67,21 @@ void MSM::transform_scalar_and_get_nonzero_scalar_indices(std::span::ThreadWorkUnits>
- */
template <typename Curve>
std::vector<typename MSM<Curve>::ThreadWorkUnits> MSM<Curve>::get_work_units(
std::span<std::span<typename Curve::ScalarField>> scalars, std::vector<std::vector<uint32_t>>& msm_scalar_indices) noexcept
@@ -117,7 +90,6 @@ std::vector::ThreadWorkUnits> MSM::get_work_units(
const size_t num_msms = scalars.size();
msm_scalar_indices.resize(num_msms);
for (size_t i = 0; i < num_msms; ++i) {
- BB_ASSERT_LT(i, scalars.size());
transform_scalar_and_get_nonzero_scalar_indices(scalars[i], msm_scalar_indices[i]);
}
@@ -130,12 +102,9 @@ std::vector::ThreadWorkUnits> MSM::get_work_units(
std::vector work_units(num_threads);
const size_t work_per_thread = numeric::ceil_div(total_work, num_threads);
- size_t work_of_last_thread = total_work - (work_per_thread * (num_threads - 1));
+ const size_t work_of_last_thread = total_work - (work_per_thread * (num_threads - 1));
- // [(MSMs + T - 1) / T] * [T - 1] > MSMs
- // T = 192
- // ([M + 191] / 192) * 193 > M
- // only use a single work unit if we don't have enough work for every thread
+ // Only use a single work unit if we don't have enough work for every thread
if (num_threads > total_work) {
for (size_t i = 0; i < num_msms; ++i) {
work_units[0].push_back(MSMWorkUnit{
@@ -150,31 +119,28 @@ std::vector::ThreadWorkUnits> MSM::get_work_units(
size_t thread_accumulated_work = 0;
size_t current_thread_idx = 0;
for (size_t i = 0; i < num_msms; ++i) {
- BB_ASSERT_DEBUG(i < msm_scalar_indices.size());
- size_t msm_work = msm_scalar_indices[i].size();
- size_t msm_size = msm_work;
- while (msm_work > 0) {
+ size_t msm_work_remaining = msm_scalar_indices[i].size();
+ const size_t initial_msm_work = msm_work_remaining;
+
+ while (msm_work_remaining > 0) {
+ BB_ASSERT_LT(current_thread_idx, work_units.size());
+
const size_t total_thread_work =
(current_thread_idx == num_threads - 1) ? work_of_last_thread : work_per_thread;
const size_t available_thread_work = total_thread_work - thread_accumulated_work;
+ const size_t work_to_assign = std::min(available_thread_work, msm_work_remaining);
- if (available_thread_work >= msm_work) {
- BB_ASSERT_LT(current_thread_idx, work_units.size());
- work_units[current_thread_idx].push_back(MSMWorkUnit{
- .batch_msm_index = i,
- .start_index = msm_size - msm_work,
- .size = msm_work,
- });
- thread_accumulated_work += msm_work;
- msm_work = 0;
- } else {
- BB_ASSERT_LT(current_thread_idx, work_units.size());
- work_units[current_thread_idx].push_back(MSMWorkUnit{
- .batch_msm_index = i,
- .start_index = msm_size - msm_work,
- .size = available_thread_work,
- });
- msm_work -= available_thread_work;
+ work_units[current_thread_idx].push_back(MSMWorkUnit{
+ .batch_msm_index = i,
+ .start_index = initial_msm_work - msm_work_remaining,
+ .size = work_to_assign,
+ });
+
+ thread_accumulated_work += work_to_assign;
+ msm_work_remaining -= work_to_assign;
+
+ // Move to next thread if current thread is full
+ if (thread_accumulated_work >= total_thread_work) {
current_thread_idx++;
thread_accumulated_work = 0;
}
@@ -184,142 +150,91 @@ std::vector::ThreadWorkUnits> MSM::get_work_units(
}
/**
- * @brief Given a scalar that is *NOT* in Montgomery form, extract a `slice_size`-bit chunk
- * @brief At round i, we extract `slice_size * (i-1)` to `slice_sice * i` most significant bits.
+ * @brief Extract a slice of bits from a scalar for Pippenger bucket assignment
+ * @details Extracts bits [lo_bit, hi_bit) from the scalar's raw limb representation.
+ * The scalar must already be converted out of Montgomery form.
+ *
+ * IMPORTANT RESTRICTIONS (optimized for Pippenger's specific usage pattern):
+ * - slice_size must be <= 32 bits (returns uint32_t)
+ * - The bit range must span at most 2 limbs (satisfied when slice_size <= 64)
+ * - hi_bit must be < 256 to avoid out-of-bounds access (satisfied since hi_bit <= NUM_BITS_IN_FIELD < 256)
*
- * @tparam Curve
- * @param scalar
- * @param round
- * @param normal_slice_size
- * @return uint32_t
+ * @param scalar The scalar field element (must be in non-Montgomery form)
+ * @param round The current Pippenger round (0 = most significant bits)
+ * @param slice_size Number of bits per slice
+ * @return uint32_t The bucket index for this round
*/
template <typename Curve>
uint32_t MSM<Curve>::get_scalar_slice(const typename Curve::ScalarField& scalar,
size_t round,
size_t slice_size) noexcept
{
+ constexpr size_t LIMB_BITS = 64;
+
size_t hi_bit = NUM_BITS_IN_FIELD - (round * slice_size);
- // todo remove
- bool last_slice = hi_bit < slice_size;
- size_t target_slice_size = last_slice ? hi_bit : slice_size;
- size_t lo_bit = last_slice ? 0 : hi_bit - slice_size;
- size_t start_limb = lo_bit / 64;
- size_t end_limb = hi_bit / 64;
- size_t lo_slice_offset = lo_bit & 63;
- size_t lo_slice_bits = std::min(target_slice_size, 64 - lo_slice_offset);
- size_t hi_slice_bits = target_slice_size - lo_slice_bits;
- size_t lo_slice = (scalar.data[start_limb] >> lo_slice_offset) & ((static_cast<uint64_t>(1) << lo_slice_bits) - 1);
- size_t hi_slice = (scalar.data[end_limb] & ((static_cast<uint64_t>(1) << hi_slice_bits) - 1));
-
- uint32_t lo = static_cast<uint32_t>(lo_slice);
- uint32_t hi = static_cast<uint32_t>(hi_slice);
-
- uint32_t result = lo + (hi << lo_slice_bits);
- return result;
+ size_t lo_bit = (hi_bit < slice_size) ? 0 : hi_bit - slice_size;
+
+ BB_ASSERT_DEBUG(lo_bit < hi_bit);
+ BB_ASSERT_DEBUG(hi_bit <= NUM_BITS_IN_FIELD); // Ensures hi_bit < 256, so end_limb <= 3
+
+ size_t start_limb = lo_bit / LIMB_BITS;
+ size_t end_limb = hi_bit / LIMB_BITS;
+ size_t lo_slice_offset = lo_bit & (LIMB_BITS - 1);
+ size_t actual_slice_size = hi_bit - lo_bit;
+ size_t lo_slice_bits =
+ (LIMB_BITS - lo_slice_offset < actual_slice_size) ? (LIMB_BITS - lo_slice_offset) : actual_slice_size;
+ size_t hi_slice_bits = actual_slice_size - lo_slice_bits;
+
+ uint64_t lo_slice = (scalar.data[start_limb] >> lo_slice_offset) & ((1ULL << lo_slice_bits) - 1);
+ uint64_t hi_slice = (start_limb != end_limb) ? (scalar.data[end_limb] & ((1ULL << hi_slice_bits) - 1)) : 0;
+
+ return static_cast<uint32_t>(lo_slice | (hi_slice << lo_slice_bits));
}
-/**
- * @brief For a given number of points, compute the optimal Pippenger bucket size
- *
- * @tparam Curve
- * @param num_points
- * @return constexpr size_t
- */
-template <typename Curve> size_t MSM<Curve>::get_optimal_log_num_buckets(const size_t num_points) noexcept
+template <typename Curve> uint32_t MSM<Curve>::get_optimal_log_num_buckets(const size_t num_points) noexcept
{
- // We do 2 group operations per bucket, and they are full 3D Jacobian adds which are ~2x more than an affine add
- constexpr size_t COST_OF_BUCKET_OP_RELATIVE_TO_POINT = 5;
- size_t cached_cost = static_cast<size_t>(-1);
- size_t target_bit_slice = 0;
- for (size_t bit_slice = 1; bit_slice < 20; ++bit_slice) {
- const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bit_slice);
- const size_t num_buckets = 1 << bit_slice;
- const size_t addition_cost = num_rounds * num_points;
- const size_t bucket_cost = num_rounds * num_buckets * COST_OF_BUCKET_OP_RELATIVE_TO_POINT;
- const size_t total_cost = addition_cost + bucket_cost;
- if (total_cost < cached_cost) {
- cached_cost = total_cost;
- target_bit_slice = bit_slice;
+ // Cost model: total_cost = num_rounds * (num_points + num_buckets * BUCKET_ACCUMULATION_COST)
+ auto compute_cost = [&](uint32_t bits) {
+ size_t rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, static_cast<size_t>(bits));
+ size_t buckets = size_t{ 1 } << bits;
+ return rounds * (num_points + buckets * BUCKET_ACCUMULATION_COST);
+ };
+
+ uint32_t best_bits = 1;
+ size_t best_cost = compute_cost(1);
+ for (uint32_t bits = 2; bits < MAX_SLICE_BITS; ++bits) {
+ size_t cost = compute_cost(bits);
+ if (cost < best_cost) {
+ best_cost = cost;
+ best_bits = bits;
}
}
- return target_bit_slice;
+ return best_bits;
}
-/**
- * @brief Given a number of points and an optimal bucket size, should we use the affine trick?
- *
- * @tparam Curve
- * @param num_points
- * @param num_buckets
- * @return true
- * @return false
- */
template <typename Curve> bool MSM<Curve>::use_affine_trick(const size_t num_points, const size_t num_buckets) noexcept
{
- if (num_points < 128) {
+ if (num_points < AFFINE_TRICK_THRESHOLD) {
return false;
}
// Affine trick requires log(N) modular inversions per Pippenger round.
- // It saves NUM_POINTS * COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION field muls
- // It also saves NUM_BUCKETS * EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1 field muls
- // due to all our buckets having Z=1 if we use the affine trick
-
- // COST_OF_INVERSION cost:
- // Requires NUM_BITS_IN_FIELD sqarings
- // We use 4-bit windows = ((NUM_BITS_IN_FIELD + 3) / 4) multiplications
- // Computing 4-bit window table requires 14 muls
- constexpr size_t COST_OF_INVERSION = NUM_BITS_IN_FIELD + ((NUM_BITS_IN_FIELD + 3) / 4) + 14;
- constexpr size_t COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION = 5;
- constexpr size_t EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1 = 5;
-
- double num_points_f = static_cast<double>(num_points);
- double log2_num_points_f = log2(num_points_f);
-
- size_t group_op_cost_saving_per_round = (num_points * COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION) +
- (num_buckets * EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1);
- double inversion_cost_per_round = log2_num_points_f * static_cast<double>(COST_OF_INVERSION);
-
- return static_cast<double>(group_op_cost_saving_per_round) > inversion_cost_per_round;
+ // It saves num_points * AFFINE_TRICK_SAVINGS_PER_OP field muls, plus
+ // num_buckets * JACOBIAN_Z_NOT_ONE_PENALTY field muls (buckets have Z=1 with affine trick)
+
+ // Cost of modular inversion via exponentiation:
+ // - NUM_BITS_IN_FIELD squarings
+ // - (NUM_BITS_IN_FIELD + 3) / 4 multiplications (4-bit windows)
+ // - INVERSION_TABLE_COST multiplications for lookup table
+ constexpr size_t COST_OF_INVERSION = NUM_BITS_IN_FIELD + ((NUM_BITS_IN_FIELD + 3) / 4) + INVERSION_TABLE_COST;
+
+ double log2_num_points = log2(static_cast<double>(num_points));
+ size_t savings_per_round = (num_points * AFFINE_TRICK_SAVINGS_PER_OP) + (num_buckets * JACOBIAN_Z_NOT_ONE_PENALTY);
+ double inversion_cost_per_round = log2_num_points * static_cast<double>(COST_OF_INVERSION);
+
+ return static_cast<double>(savings_per_round) > inversion_cost_per_round;
}
-/**
- * @brief adds a bunch of points together using affine addition formulae.
- * @details Paradoxically, the affine formula is crazy efficient if you have a lot of independent point additions to
- * perform. Affine formula:
- *
- * \lambda = (y_2 - y_1) / (x_2 - x_1)
- * x_3 = \lambda^2 - (x_2 + x_1)
- * y_3 = \lambda*(x_1 - x_3) - y_1
- *
- * Traditionally, we avoid affine formulae like the plague, because computing lambda requires a modular inverse,
- * which is outrageously expensive.
- *
- * However! We can use Montgomery's batch inversion technique to amortise the cost of the inversion to ~0.
- *
- * The way batch inversion works is as follows. Let's say you want to compute \{ 1/x_1, 1/x_2, ..., 1/x_n \}
- * The trick is to compute the product x_1x_2...x_n , whilst storing all of the temporary products.
- * i.e. we have an array A = [x_1, x_1x_2, ..., x_1x_2...x_n]
- * We then compute a single inverse: I = 1 / x_1x_2...x_n
- * Finally, we can use our accumulated products, to quotient out individual inverses.
- * We can get an individual inverse at index i, by computing I.A_{i-1}.(x_nx_n-1...x_i+1)
- * The last product term we can compute on-the-fly, as it grows by one element for each additional inverse that we
- * require.
- *
- * TLDR: amortized cost of a modular inverse is 3 field multiplications per inverse.
- * Which means we can compute a point addition with SIX field multiplications in total.
- * The traditional Jacobian-coordinate formula requires 11.
- *
- * There is a catch though - we need large sequences of independent point additions!
- * i.e. the output from one point addition in the sequence is NOT an input to any other point addition in the
- * sequence.
- *
- * We can re-arrange the Pippenger algorithm to get this property, but it's...complicated
- * @tparam Curve
- * @param points points to be added pairwise; result is stored in the latter half of the array
- * @param num_points
- * @param scratch_space coordinate field scratch space needed for batched inversion
- **/
template <typename Curve>
void MSM::add_affine_points(typename Curve::AffineElement* points,
const size_t num_points,
@@ -328,12 +243,14 @@ void MSM::add_affine_points(typename Curve::AffineElement* points,
using Fq = typename Curve::BaseField;
Fq batch_inversion_accumulator = Fq::one();
+ // Forward pass: prepare batch inversion inputs.
+ // We reuse points[i+1] storage: .x stores (x2-x1), .y stores (y2-y1)*accumulator
for (size_t i = 0; i < num_points; i += 2) {
- scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1
- points[i + 1].x -= points[i].x; // x2 - x1
- points[i + 1].y -= points[i].y; // y2 - y1
+ scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 (needed later for x3)
+ points[i + 1].x -= points[i].x; // x2 - x1 (denominator for lambda)
+ points[i + 1].y -= points[i].y; // y2 - y1 (numerator for lambda)
points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old
- batch_inversion_accumulator *= (points[i + 1].x);
+ batch_inversion_accumulator *= (points[i + 1].x); // accumulate denominators
}
if (batch_inversion_accumulator == 0) {
// prefer abort to throw for code that might emit from multiple threads
@@ -342,421 +259,215 @@ void MSM::add_affine_points(typename Curve::AffineElement* points,
batch_inversion_accumulator = batch_inversion_accumulator.invert();
}
- // Iterate backwards through the points, comnputing pairwise affine additions; addition results are stored in the
- // latter half of the array
+ // Backward pass: compute additions using batch inversion results.
+ // Reusing points[i+1] storage: .y becomes lambda, .x becomes lambda^2.
+ // Results are written to the top half of points array: points[(i+num_points)/2].
+ // Loop terminates when i underflows: unsigned wrap-around makes i larger than num_points.
for (size_t i = (num_points)-2; i < num_points; i -= 2) {
- points[i + 1].y *= batch_inversion_accumulator; // update accumulator
- batch_inversion_accumulator *= points[i + 1].x;
- points[i + 1].x = points[i + 1].y.sqr();
- points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda_squared - x2
- // - x1
- // Memory bandwidth is a bit of a bottleneck here.
- // There's probably a more elegant way of structuring our data so we don't need to do all of this
- // prefetching
+ points[i + 1].y *= batch_inversion_accumulator; // .y now holds lambda = (y2-y1)/(x2-x1)
+ batch_inversion_accumulator *= points[i + 1].x; // restore accumulator for next iteration
+ points[i + 1].x = points[i + 1].y.sqr(); // .x now holds lambda^2
+ points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda^2 - x2 - x1
+ // Output addresses jump non-sequentially: points[(i+n)>>1] defeats hardware prefetcher.
+ // Fetching 2 iterations ahead ensures data arrives before needed.
if (i >= 2) {
__builtin_prefetch(points + i - 2);
__builtin_prefetch(points + i - 1);
__builtin_prefetch(points + ((i + num_points - 2) >> 1));
__builtin_prefetch(scratch_space + ((i - 2) >> 1));
}
- points[i].x -= points[(i + num_points) >> 1].x;
- points[i].x *= points[i + 1].y;
- points[(i + num_points) >> 1].y = points[i].x - points[i].y;
+ // Compute y3 = lambda * (x1 - x3) - y1, reusing points[i].x as temp storage
+ points[i].x -= points[(i + num_points) >> 1].x; // x1 - x3
+ points[i].x *= points[i + 1].y; // lambda * (x1 - x3)
+ points[(i + num_points) >> 1].y = points[i].x - points[i].y; // y3 = lambda*(x1-x3) - y1
}
}
-/**
- * @brief Top-level Pippenger algorithm where number of points is small and we are not using the Affine trick
- *
- * @tparam Curve
- * @param msm_data
- * @return Curve::AffineElement
- */
template <typename Curve>
-typename Curve::Element MSM::small_pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept
+typename Curve::Element MSM::jacobian_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept
{
- std::span& nonzero_scalar_indices = msm_data.scalar_indices;
- const size_t size = nonzero_scalar_indices.size();
- const size_t bits_per_slice = get_optimal_log_num_buckets(size);
- const size_t num_buckets = 1 << bits_per_slice;
- JacobianBucketAccumulators bucket_data = JacobianBucketAccumulators(num_buckets);
- Element round_output = Curve::Group::point_at_infinity;
+ const size_t size = msm_data.scalar_indices.size();
+ const uint32_t bits_per_slice = get_optimal_log_num_buckets(size);
+ const size_t num_buckets = size_t{ 1 } << bits_per_slice;
+ const uint32_t num_rounds = static_cast<uint32_t>((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice);
+ const uint32_t remainder = NUM_BITS_IN_FIELD % bits_per_slice;
+
+ JacobianBucketAccumulators bucket_data(num_buckets);
+ Element msm_result = Curve::Group::point_at_infinity;
+
+ for (uint32_t round = 0; round < num_rounds; ++round) {
+ // Populate buckets using Jacobian accumulation
+ for (size_t i = 0; i < size; ++i) {
+ uint32_t idx = msm_data.scalar_indices[i];
+ uint32_t bucket = get_scalar_slice(msm_data.scalars[idx], round, bits_per_slice);
+ if (bucket > 0) {
+ if (bucket_data.bucket_exists.get(bucket)) {
+ bucket_data.buckets[bucket] += msm_data.points[idx];
+ } else {
+ bucket_data.buckets[bucket] = msm_data.points[idx];
+ bucket_data.bucket_exists.set(bucket, true);
+ }
+ }
+ }
- const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
+ // Reduce buckets and accumulate into result
+ Element bucket_result = accumulate_buckets(bucket_data);
+ bucket_data.bucket_exists.clear();
- for (size_t i = 0; i < num_rounds; ++i) {
- round_output = evaluate_small_pippenger_round(msm_data, i, bucket_data, round_output, bits_per_slice);
+ uint32_t num_doublings = (round == num_rounds - 1 && remainder != 0) ? remainder : bits_per_slice;
+ for (uint32_t i = 0; i < num_doublings; ++i) {
+ msm_result.self_dbl();
+ }
+ msm_result += bucket_result;
}
- return round_output;
+ return msm_result;
}
-/**
- * @brief Top-level Pippenger algorithm
- *
- * @tparam Curve
- * @param msm_data
- * @return Curve::AffineElement
- */
template <typename Curve>
-typename Curve::Element MSM::pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept
+typename Curve::Element MSM::affine_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept
{
- const size_t msm_size = msm_data.scalar_indices.size();
- const size_t bits_per_slice = get_optimal_log_num_buckets(msm_size);
- const size_t num_buckets = 1 << bits_per_slice;
+ const size_t num_points = msm_data.scalar_indices.size();
+ const uint32_t bits_per_slice = get_optimal_log_num_buckets(num_points);
+ const size_t num_buckets = size_t{ 1 } << bits_per_slice;
- if (!use_affine_trick(msm_size, num_buckets)) {
- return small_pippenger_low_memory_with_transformed_scalars(msm_data);
+ if (!use_affine_trick(num_points, num_buckets)) {
+ return jacobian_pippenger_with_transformed_scalars(msm_data);
}
- // TODO(https://github.com/AztecProtocol/barretenberg/issues/1452): Consider allowing this memory to persist rather
- // than allocating/deallocating on every execution.
- AffineAdditionData affine_data = AffineAdditionData();
- BucketAccumulators bucket_data = BucketAccumulators(num_buckets);
- Element round_output = Curve::Group::point_at_infinity;
+ const uint32_t num_rounds = static_cast<uint32_t>((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice);
+ const uint32_t remainder = NUM_BITS_IN_FIELD % bits_per_slice;
- const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
- for (size_t i = 0; i < num_rounds; ++i) {
- round_output = evaluate_pippenger_round(msm_data, i, affine_data, bucket_data, round_output, bits_per_slice);
- }
+ // Per-call allocation for WASM compatibility (thread_local causes issues in WASM)
+ AffineAdditionData affine_data;
+ BucketAccumulators bucket_data(num_buckets);
- return (round_output);
-}
+ Element msm_result = Curve::Group::point_at_infinity;
-/**
- * @brief Evaluate a single Pippenger round when we do not use the Affine trick
- *
- * @tparam Curve
- * @param msm_data
- * @param round_index
- * @param bucket_data
- * @param previous_round_output
- * @param bits_per_slice
- * @return Curve::Element
- */
-template
-typename Curve::Element MSM::evaluate_small_pippenger_round(MSMData& msm_data,
- const size_t round_index,
- MSM::JacobianBucketAccumulators& bucket_data,
- typename Curve::Element previous_round_output,
- const size_t bits_per_slice) noexcept
-{
- std::span& nonzero_scalar_indices = msm_data.scalar_indices;
- std::span& scalars = msm_data.scalars;
- std::span& points = msm_data.points;
-
- const size_t size = nonzero_scalar_indices.size();
- for (size_t i = 0; i < size; ++i) {
- BB_ASSERT_DEBUG(nonzero_scalar_indices[i] < scalars.size());
- uint32_t bucket_index = get_scalar_slice(scalars[nonzero_scalar_indices[i]], round_index, bits_per_slice);
- BB_ASSERT_DEBUG(bucket_index < static_cast(1 << bits_per_slice));
- if (bucket_index > 0) {
- // do this check because we do not reset bucket_data.buckets after each round
- // (i.e. not neccessarily at infinity)
- if (bucket_data.bucket_exists.get(bucket_index)) {
- bucket_data.buckets[bucket_index] += points[nonzero_scalar_indices[i]];
- } else {
- bucket_data.buckets[bucket_index] = points[nonzero_scalar_indices[i]];
- bucket_data.bucket_exists.set(bucket_index, true);
+ for (uint32_t round = 0; round < num_rounds; ++round) {
+ // Build point schedule for this round
+ {
+ for (size_t i = 0; i < num_points; ++i) {
+ uint32_t idx = msm_data.scalar_indices[i];
+ uint32_t bucket_idx = get_scalar_slice(msm_data.scalars[idx], round, bits_per_slice);
+ msm_data.point_schedule[i] = PointScheduleEntry::create(idx, bucket_idx).data;
}
}
- }
- Element round_output;
- round_output.self_set_infinity();
- round_output = accumulate_buckets(bucket_data);
- bucket_data.bucket_exists.clear();
- Element result = previous_round_output;
- const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
- size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0))
- ? NUM_BITS_IN_FIELD % bits_per_slice
- : bits_per_slice;
- for (size_t i = 0; i < num_doublings; ++i) {
- result.self_dbl();
- }
- result += round_output;
- return result;
-}
-
-/**
- * @brief Evaluate a single Pippenger round where we use the affine trick
- *
- * @tparam Curve
- * @param msm_data
- * @param round_index
- * @param affine_data
- * @param bucket_data
- * @param previous_round_output
- * @param bits_per_slice
- * @return Curve::Element
- */
-template
-typename Curve::Element MSM::evaluate_pippenger_round(MSMData& msm_data,
- const size_t round_index,
- MSM::AffineAdditionData& affine_data,
- MSM::BucketAccumulators& bucket_data,
- typename Curve::Element previous_round_output,
- const size_t bits_per_slice) noexcept
-{
- std::span& scalar_indices = msm_data.scalar_indices; // indices of nonzero scalars
- std::span& scalars = msm_data.scalars;
- std::span& points = msm_data.points;
- std::span& round_schedule = msm_data.point_schedule;
- const size_t size = scalar_indices.size();
-
- // Construct a "round schedule". Each entry describes:
- // 1. low 32 bits: which bucket index do we add the point into? (bucket index = slice value)
- // 2. high 32 bits: which point index do we source the point from?
- for (size_t i = 0; i < size; ++i) {
- BB_ASSERT_DEBUG(scalar_indices[i] < scalars.size());
- round_schedule[i] = get_scalar_slice(scalars[scalar_indices[i]], round_index, bits_per_slice);
- round_schedule[i] += (static_cast(scalar_indices[i]) << 32ULL);
- }
- // Sort our point schedules based on their bucket values. Reduces memory throughput in next step of algo
- const size_t num_zero_entries = scalar_multiplication::process_buckets_count_zero_entries(
- &round_schedule[0], size, static_cast(bits_per_slice));
- BB_ASSERT_DEBUG(num_zero_entries <= size);
- const size_t round_size = size - num_zero_entries;
-
- Element round_output;
- round_output.self_set_infinity();
-
- if (round_size > 0) {
- std::span point_schedule(&round_schedule[num_zero_entries], round_size);
- // Iterate through our point schedule and add points into corresponding buckets
- consume_point_schedule(point_schedule, points, affine_data, bucket_data, 0, 0);
- round_output = accumulate_buckets(bucket_data);
- bucket_data.bucket_exists.clear();
- }
+ // Sort by bucket and count zero-bucket entries
+ size_t num_zero_bucket_entries =
+ sort_point_schedule_and_count_zero_buckets(&msm_data.point_schedule[0], num_points, bits_per_slice);
+ size_t round_size = num_points - num_zero_bucket_entries;
+
+ // Accumulate points into buckets
+ Element bucket_result = Curve::Group::point_at_infinity;
+ if (round_size > 0) {
+ std::span schedule(&msm_data.point_schedule[num_zero_bucket_entries], round_size);
+ batch_accumulate_points_into_buckets(schedule, msm_data.points, affine_data, bucket_data);
+ bucket_result = accumulate_buckets(bucket_data);
+ bucket_data.bucket_exists.clear();
+ }
- Element result = previous_round_output;
- const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
- size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0))
- ? NUM_BITS_IN_FIELD % bits_per_slice
- : bits_per_slice;
- for (size_t i = 0; i < num_doublings; ++i) {
- result.self_dbl();
+ // Combine into running result
+ uint32_t num_doublings = (round == num_rounds - 1 && remainder != 0) ? remainder : bits_per_slice;
+ for (uint32_t i = 0; i < num_doublings; ++i) {
+ msm_result.self_dbl();
+ }
+ msm_result += bucket_result;
}
- result += round_output;
- return result;
+ return msm_result;
}
-/**
- * @brief Given a list of points and target buckets to add into, perform required group operations
- * @details This algorithm uses exclusively affine group operations, using batch inversions to amortise costs
- *
- * @tparam Curve
- * @param point_schedule
- * @param points
- * @param affine_data
- * @param bucket_data
- * @param num_input_points_processed
- * @param num_queued_affine_points
- */
template <typename Curve>
-void MSM::consume_point_schedule(std::span point_schedule,
- std::span points,
- MSM::AffineAdditionData& affine_data,
- MSM::BucketAccumulators& bucket_data,
- size_t num_input_points_processed,
- size_t num_queued_affine_points) noexcept
+void MSM::batch_accumulate_points_into_buckets(std::span point_schedule,
+ std::span points,
+ MSM::AffineAdditionData& affine_data,
+ MSM::BucketAccumulators& bucket_data) noexcept
{
- size_t point_it = num_input_points_processed;
- size_t affine_input_it = num_queued_affine_points;
- // N.B. points and point_schedule MAY HAVE DIFFERENT SIZES
- // We source the number of actual points we work on from the point schedule
- size_t num_points = point_schedule.size();
- auto& bucket_accumulator_exists = bucket_data.bucket_exists;
- auto& affine_addition_scratch_space = affine_data.points_to_add;
- auto& bucket_accumulators = bucket_data.buckets;
- auto& affine_addition_output_bucket_destinations = affine_data.addition_result_bucket_destinations;
- auto& scalar_scratch_space = affine_data.scalar_scratch_space;
- auto& output_point_schedule = affine_data.addition_result_bucket_destinations;
- AffineElement null_location{};
- // We do memory prefetching, `prefetch_max` ensures we do not overflow our containers
- size_t prefetch_max = (num_points - 32);
- if (num_points < 32) {
- prefetch_max = 0;
- }
- size_t end = num_points - 1;
- if (num_points == 0) {
- end = 0;
- }
+ if (point_schedule.empty()) {
+ return;
+ }
+
+ size_t point_it = 0;
+ size_t scratch_it = 0;
+ const size_t num_points = point_schedule.size();
+ const size_t prefetch_max = (num_points >= PREFETCH_LOOKAHEAD) ? (num_points - PREFETCH_LOOKAHEAD) : 0;
+ const size_t last_index = num_points - 1;
+
+ // Iterative loop - continues until all points processed and no work remains in scratch space
+ while (point_it < num_points || scratch_it != 0) {
+ // Step 1: Fill scratch space with up to BATCH_SIZE/2 independent additions
+ while (((scratch_it + 1) < AffineAdditionData::BATCH_SIZE) && (point_it < last_index)) {
+ // Prefetch points we'll need soon (every PREFETCH_INTERVAL iterations)
+ if ((point_it < prefetch_max) && ((point_it & PREFETCH_INTERVAL_MASK) == 0)) {
+ for (size_t i = PREFETCH_LOOKAHEAD / 2; i < PREFETCH_LOOKAHEAD; ++i) {
+ PointScheduleEntry entry{ point_schedule[point_it + i] };
+ __builtin_prefetch(&points[entry.point_index()]);
+ }
+ }
+
+ PointScheduleEntry lhs{ point_schedule[point_it] };
+ PointScheduleEntry rhs{ point_schedule[point_it + 1] };
+
+ process_bucket_pair(lhs.bucket_index(),
+ rhs.bucket_index(),
+ &points[lhs.point_index()],
+ &points[rhs.point_index()],
+ affine_data,
+ bucket_data,
+ scratch_it,
+ point_it);
+ }
- // Step 1: Fill up `affine_addition_scratch_space` with up to AffineAdditionData::BATCH_SIZE/2 independent additions
- while (((affine_input_it + 1) < AffineAdditionData::BATCH_SIZE) && (point_it < end)) {
+ // Handle the last point (odd count case) - separate to avoid bounds check on point_schedule[point_it + 1]
+ if (point_it == last_index) {
+ PointScheduleEntry last{ point_schedule[point_it] };
+ process_single_point(
+ last.bucket_index(), &points[last.point_index()], affine_data, bucket_data, scratch_it, point_it);
+ }
- // we prefetchin'
- if ((point_it < prefetch_max) && ((point_it & 0x0f) == 0)) {
- for (size_t i = 16; i < 32; ++i) {
- __builtin_prefetch(&points[(point_schedule[point_it + i] >> 32ULL)]);
- }
+ // Compute independent additions using Montgomery's batch inversion trick
+ size_t num_points_to_add = scratch_it;
+ if (num_points_to_add >= 2) {
+ add_affine_points(
+ affine_data.points_to_add.data(), num_points_to_add, affine_data.inversion_scratch_space.data());
}
- // We do some branchless programming here to minimize instruction pipeline flushes
- // TODO(@zac-williamson, cc @ludamad) check these ternary operators are not branching! -> (ludamad: they don't,
- // but its not clear that the conditional move is fundamentally less expensive)
- // We are iterating through our points and
- // can come across the following scenarios: 1: The next 2 points in `point_schedule` belong to the *same* bucket
- // (happy path - can put both points into affine_addition_scratch_space)
- // 2: The next 2 points have different bucket destinations AND point_schedule[point_it].bucket contains a point
- // (happyish path - we can put points[lhs_schedule] and buckets[lhs_bucket] into
- // affine_addition_scratch_space)
- // 3: The next 2 points have different bucket destionations AND point_schedule[point_it].bucket is empty
- // We cache points[lhs_schedule] into buckets[lhs_bucket]
- // We iterate `point_it` by 2 (case 1), or by 1 (case 2 or 3). The number of points we add into
- // `affine_addition_scratch_space` is 2 (case 1 or 2) or 0 (case 3).
- uint64_t lhs_schedule = point_schedule[point_it];
- uint64_t rhs_schedule = point_schedule[point_it + 1];
- size_t lhs_bucket = static_cast(lhs_schedule) & 0xFFFFFFFF;
- size_t rhs_bucket = static_cast(rhs_schedule) & 0xFFFFFFFF;
- size_t lhs_point = static_cast(lhs_schedule >> 32);
- size_t rhs_point = static_cast(rhs_schedule >> 32);
-
- bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
- bool buckets_match = lhs_bucket == rhs_bucket;
- bool do_affine_add = buckets_match || has_bucket_accumulator;
-
- const AffineElement* lhs_source = &points[lhs_point];
- const AffineElement* rhs_source = buckets_match ? &points[rhs_point] : &bucket_accumulators[lhs_bucket];
-
- // either two points are set to be added (point to point or point into bucket accumulator), or lhs is stored in
- // the bucket and rhs is temporarily ignored
- AffineElement* lhs_destination =
- do_affine_add ? &affine_addition_scratch_space[affine_input_it] : &bucket_accumulators[lhs_bucket];
- AffineElement* rhs_destination =
- do_affine_add ? &affine_addition_scratch_space[affine_input_it + 1] : &null_location;
-
- // if performing an affine add, set the destination bucket corresponding to the addition result
- uint64_t& source_bucket_destination = affine_addition_output_bucket_destinations[affine_input_it >> 1];
- source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination;
-
- // unconditional swap. No if statements here.
- *lhs_destination = *lhs_source;
- *rhs_destination = *rhs_source;
-
- // indicate whether bucket_accumulators[lhs_bucket] will contain a point after this iteration
- bucket_accumulator_exists.set(
- lhs_bucket,
- (has_bucket_accumulator && buckets_match) || /* bucket has an accum and its not being used in current add */
- !do_affine_add); /* lhs point is cached into the bucket */
-
- affine_input_it += do_affine_add ? 2 : 0;
- point_it += (do_affine_add && buckets_match) ? 2 : 1;
- }
- // We have to handle the last point as an edge case so that we dont overflow the bounds of `point_schedule`. If the
- // bucket accumulator exists, we add the point to it, otherwise the point simply becomes the bucket accumulator.
- if (point_it == num_points - 1) {
- uint64_t lhs_schedule = point_schedule[point_it];
- size_t lhs_bucket = static_cast(lhs_schedule) & 0xFFFFFFFF;
- size_t lhs_point = static_cast(lhs_schedule >> 32);
- bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
-
- if (has_bucket_accumulator) { // point is added to its bucket accumulator
- affine_addition_scratch_space[affine_input_it] = points[lhs_point];
- affine_addition_scratch_space[affine_input_it + 1] = bucket_accumulators[lhs_bucket];
- bucket_accumulator_exists.set(lhs_bucket, false);
- affine_addition_output_bucket_destinations[affine_input_it >> 1] = lhs_bucket;
- affine_input_it += 2;
- point_it += 1;
- } else { // otherwise, cache the point into the bucket
- BB_ASSERT_DEBUG(lhs_point < points.size());
- bucket_accumulators[lhs_bucket] = points[lhs_point];
- bucket_accumulator_exists.set(lhs_bucket, true);
- point_it += 1;
+ // add_affine_points stores results in the top-half of scratch space
+ AffineElement* affine_output = affine_data.points_to_add.data() + (num_points_to_add / 2);
+
+ // Recirculate addition outputs back into scratch space or bucket accumulators
+ size_t new_scratch_it = 0;
+ size_t output_it = 0;
+ size_t num_outputs = num_points_to_add / 2;
+
+ while ((num_outputs > 1) && (output_it + 1 < num_outputs)) {
+ uint32_t lhs_bucket = affine_data.addition_result_bucket_destinations[output_it];
+ uint32_t rhs_bucket = affine_data.addition_result_bucket_destinations[output_it + 1];
+
+ process_bucket_pair(lhs_bucket,
+ rhs_bucket,
+ &affine_output[output_it],
+ &affine_output[output_it + 1],
+ affine_data,
+ bucket_data,
+ new_scratch_it,
+ output_it);
}
- }
- // Now that we have populated `affine_addition_scratch_space`,
- // compute `num_affine_points_to_add` independent additions using the Affine trick
- size_t num_affine_points_to_add = affine_input_it;
- if (num_affine_points_to_add >= 2) {
- add_affine_points(&affine_addition_scratch_space[0], num_affine_points_to_add, &scalar_scratch_space[0]);
- }
- // `add_affine_points` stores the result in the top-half of the used scratch space
- G1* affine_output = &affine_addition_scratch_space[0] + (num_affine_points_to_add / 2);
-
- // Process the addition outputs.
- // We either need to feed the addition outputs back into affine_addition_scratch_space for more addition operations.
- // Or, if there are no more additions for a bucket, we store the addition output in a bucket accumulator.
- size_t new_scratch_space_it = 0;
- size_t affine_output_it = 0;
- size_t num_affine_output_points = num_affine_points_to_add / 2;
- // This algorithm is equivalent to the one we used to populate `affine_addition_scratch_space` from the point
- // schedule, however here we source points from a different location (the addition results)
- while ((affine_output_it < (num_affine_output_points - 1)) && (num_affine_output_points > 0)) {
- size_t lhs_bucket = static_cast(affine_addition_output_bucket_destinations[affine_output_it]);
- size_t rhs_bucket = static_cast(affine_addition_output_bucket_destinations[affine_output_it + 1]);
- BB_ASSERT_DEBUG(lhs_bucket < bucket_accumulator_exists.size());
-
- bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
- bool buckets_match = (lhs_bucket == rhs_bucket);
- bool do_affine_add = buckets_match || has_bucket_accumulator;
-
- const AffineElement* lhs_source = &affine_output[affine_output_it];
- const AffineElement* rhs_source =
- buckets_match ? &affine_output[affine_output_it + 1] : &bucket_accumulators[lhs_bucket];
-
- AffineElement* lhs_destination =
- do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it] : &bucket_accumulators[lhs_bucket];
- AffineElement* rhs_destination =
- do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it + 1] : &null_location;
-
- uint64_t& source_bucket_destination = output_point_schedule[new_scratch_space_it >> 1];
- source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination;
-
- *lhs_destination = *lhs_source;
- *rhs_destination = *rhs_source;
-
- bucket_accumulator_exists.set(lhs_bucket, (has_bucket_accumulator && buckets_match) || !do_affine_add);
- new_scratch_space_it += do_affine_add ? 2 : 0;
- affine_output_it += (do_affine_add && buckets_match) ? 2 : 1;
- }
- // perform final iteration as edge case so we don't overflow `affine_addition_output_bucket_destinations`
- if (affine_output_it == (num_affine_output_points - 1)) {
-
- size_t lhs_bucket = static_cast(affine_addition_output_bucket_destinations[affine_output_it]);
-
- bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
- if (has_bucket_accumulator) {
- BB_ASSERT_DEBUG(new_scratch_space_it + 1 < affine_addition_scratch_space.size());
- BB_ASSERT_DEBUG(lhs_bucket < bucket_accumulators.size());
- BB_ASSERT_DEBUG((new_scratch_space_it >> 1) < output_point_schedule.size());
- affine_addition_scratch_space[new_scratch_space_it] = affine_output[affine_output_it];
- affine_addition_scratch_space[new_scratch_space_it + 1] = bucket_accumulators[lhs_bucket];
- bucket_accumulator_exists.set(lhs_bucket, false);
- output_point_schedule[new_scratch_space_it >> 1] = lhs_bucket;
- new_scratch_space_it += 2;
- affine_output_it += 1;
- } else {
- bucket_accumulators[lhs_bucket] = affine_output[affine_output_it];
- bucket_accumulator_exists.set(lhs_bucket, true);
- affine_output_it += 1;
+ // Handle the last output (odd count case)
+ if (num_outputs > 0 && output_it == num_outputs - 1) {
+ uint32_t bucket = affine_data.addition_result_bucket_destinations[output_it];
+ process_single_point(
+ bucket, &affine_output[output_it], affine_data, bucket_data, new_scratch_it, output_it);
}
- }
- // If we have not finished iterating over the point schedule,
- // OR we have affine additions to perform in the scratch space, continue
- if (point_it < num_points || new_scratch_space_it != 0) {
- consume_point_schedule(point_schedule, points, affine_data, bucket_data, point_it, new_scratch_space_it);
+ // Continue with recirculated points
+ scratch_it = new_scratch_it;
}
}
-/**
- * @brief Compute multiple multi-scalar multiplications.
- * @details If we need to perform multiple MSMs, this method will be more efficient than calling `msm` repeatedly
- * This is because this method will be able to dispatch equal work to all threads without splitting the input
- * msms up so much.
- * The Pippenger algorithm runtime is O(N/log(N)) so there will be slight gains as each inner-thread MSM will
- * have a larger N.
- * The input scalars are not const because the algorithm converts them out of Montgomery form and then back.
- *
- * @tparam Curve
- * @param points
- * @param scalars
- * @return std::vector
- */
template <typename Curve>
std::vector MSM::batch_multi_scalar_mul(
std::span> points,
@@ -772,32 +483,29 @@ std::vector MSM::batch_multi_scalar_mul(
std::vector>> thread_msm_results(num_cpus);
BB_ASSERT_EQ(thread_work_units.size(), num_cpus);
+ // Select Pippenger implementation once (hoisting branch outside hot loop)
+ // Jacobian: safe, handles edge cases | Affine: faster, assumes linearly independent points
+ auto pippenger_impl =
+ handle_edge_cases ? jacobian_pippenger_with_transformed_scalars : affine_pippenger_with_transformed_scalars;
+
// Once we have our work units, each thread can independently evaluate its assigned msms
parallel_for(num_cpus, [&](size_t thread_idx) {
if (!thread_work_units[thread_idx].empty()) {
const std::vector& msms = thread_work_units[thread_idx];
std::vector>& msm_results = thread_msm_results[thread_idx];
+ msm_results.reserve(msms.size());
+
+ // Point schedule buffer for this thread - avoids per-work-unit heap allocation
+ std::vector point_schedule_buffer;
+
for (const MSMWorkUnit& msm : msms) {
- std::span work_scalars = scalars[msm.batch_msm_index];
- std::span work_points = points[msm.batch_msm_index];
- std::span work_indices =
- std::span{ &msm_scalar_indices[msm.batch_msm_index][msm.start_index], msm.size };
- std::vector point_schedule(msm.size);
- MSMData msm_data(work_scalars, work_points, work_indices, std::span(point_schedule));
- Element msm_result = Curve::Group::point_at_infinity;
- constexpr size_t SINGLE_MUL_THRESHOLD = 16;
- if (msm.size < SINGLE_MUL_THRESHOLD) {
- msm_result = small_mul(work_scalars, work_points, msm_data.scalar_indices, msm.size);
- } else {
- // Our non-affine method implicitly handles cases where Weierstrass edge cases may occur
- // Note: not as fast! use unsafe version if you know all input base points are linearly independent
- if (handle_edge_cases) {
- msm_result = small_pippenger_low_memory_with_transformed_scalars(msm_data);
- } else {
- msm_result = pippenger_low_memory_with_transformed_scalars(msm_data);
- }
- }
- msm_results.push_back(std::make_pair(msm_result, msm.batch_msm_index));
+ point_schedule_buffer.resize(msm.size);
+ MSMData msm_data =
+ MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm);
+ Element msm_result =
+ (msm.size < PIPPENGER_THRESHOLD) ? small_mul(msm_data) : pippenger_impl(msm_data);
+
+ msm_results.emplace_back(msm_result, msm.batch_msm_index);
}
}
});
@@ -805,61 +513,49 @@ std::vector MSM::batch_multi_scalar_mul(
// Accumulate results. This part needs to be single threaded, but amount of work done here should be small
// TODO(@zac-williamson) check this? E.g. if we are doing a 2^16 MSM with 256 threads this single-threaded part
// will be painful.
- std::vector results(num_msms);
- for (Element& ele : results) {
- ele.self_set_infinity();
- }
+ std::vector results(num_msms, Curve::Group::point_at_infinity);
for (const auto& single_thread_msm_results : thread_msm_results) {
- for (const std::pair& result : single_thread_msm_results) {
- results[result.second] += result.first;
+ for (const auto& [element, index] : single_thread_msm_results) {
+ results[index] += element;
}
}
- Element::batch_normalize(&results[0], num_msms);
+ Element::batch_normalize(results.data(), num_msms);
- std::vector affine_results;
- for (const auto& ele : results) {
- affine_results.emplace_back(AffineElement(ele.x, ele.y));
- }
-
- // Convert our scalars back into Montgomery form so they remain unchanged
- for (auto& msm_scalars : scalars) {
- parallel_for_range(msm_scalars.size(), [&](size_t start, size_t end) {
+ // Convert scalars back TO Montgomery form so they remain unchanged from caller's perspective
+ for (auto& scalar_span : scalars) {
+ parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
- msm_scalars[i].self_to_montgomery_form();
+ scalar_span[i].self_to_montgomery_form();
}
});
}
- return affine_results;
+
+ return std::vector(results.begin(), results.end());
}
-/**
- * @brief Helper method to evaluate a single MSM. Internally calls `batch_multi_scalar_mul`
- *
- * @tparam Curve
- * @param points
- * @param _scalars
- * @return Curve::AffineElement
- */
template <typename Curve>
typename Curve::AffineElement MSM::msm(std::span points,
- PolynomialSpan _scalars,
+ PolynomialSpan scalars,
bool handle_edge_cases) noexcept
{
- if (_scalars.size() == 0) {
+ if (scalars.size() == 0) {
return Curve::Group::affine_point_at_infinity;
}
- BB_ASSERT_GTE(points.size(), _scalars.start_index + _scalars.size());
-
- // unfortnately we need to remove const on this data type to prevent duplicating _scalars (which is typically
- // large) We need to convert `_scalars` out of montgomery form for the MSM. We then convert the scalars back
- // into Montgomery form at the end of the algorithm. NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
- // TODO(https://github.com/AztecProtocol/barretenberg/issues/1449): handle const correctness.
- ScalarField* scalars = const_cast(&_scalars[_scalars.start_index]);
-
- std::vector> pp{ points.subspan(_scalars.start_index) };
- std::vector> ss{ std::span(scalars, _scalars.size()) };
- AffineElement result = batch_multi_scalar_mul(pp, ss, handle_edge_cases)[0];
- return result;
+ const size_t num_scalars = scalars.size();
+ BB_ASSERT_GTE(points.size(), scalars.start_index + num_scalars);
+
+ // const_cast is safe: we convert from Montgomery, compute, then convert back.
+ // Scalars are unchanged from the caller's perspective.
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+ ScalarField* scalar_ptr = const_cast(&scalars[scalars.start_index]);
+ std::span scalar_span(scalar_ptr, num_scalars);
+
+ // Wrap into a size-1 batch and delegate to the general method that properly handles multi-threading
+ std::array, 1> points_batch{ points.subspan(scalars.start_index) };
+ std::array, 1> scalars_batch{ scalar_span };
+
+ auto results = batch_multi_scalar_mul(std::span(points_batch), std::span(scalars_batch), handle_edge_cases);
+ return results[0];
}
template
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp
index 448c61b99421..0f410bfe1a63 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp
@@ -12,10 +12,8 @@
#include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp"
#include "barretenberg/polynomials/polynomial.hpp"
-#include "./process_buckets.hpp"
-#include "./scalar_multiplication.hpp"
-
#include "./bitvector.hpp"
+#include "./process_buckets.hpp"
namespace bb::scalar_multiplication {
template class MSM {
@@ -25,15 +23,70 @@ template class MSM {
using BaseField = typename Curve::BaseField;
using AffineElement = typename Curve::AffineElement;
- using G1 = AffineElement;
static constexpr size_t NUM_BITS_IN_FIELD = ScalarField::modulus.get_msb() + 1;
+ // ======================= Algorithm Tuning Constants =======================
+ //
+ // These constants control the behavior of the Pippenger MSM algorithm.
+ // They are empirically tuned for performance on typical hardware.
+
+ // Below this threshold, use naive scalar multiplication instead of Pippenger
+ static constexpr size_t PIPPENGER_THRESHOLD = 16;
+
+ // Below this threshold, the affine batch inversion trick is not beneficial
+ // (cost of inversions exceeds savings from cheaper affine additions)
+ static constexpr size_t AFFINE_TRICK_THRESHOLD = 128;
+
+ // Maximum bits per scalar slice (2^20 = 1M buckets, far beyond practical use)
+ static constexpr size_t MAX_SLICE_BITS = 20;
+
+ // Number of points to look ahead for memory prefetching
+ static constexpr size_t PREFETCH_LOOKAHEAD = 32;
+
+ // Prefetch every N iterations (must be power of 2); mask is N-1 for efficient modulo
+ static constexpr size_t PREFETCH_INTERVAL = 16;
+ static constexpr size_t PREFETCH_INTERVAL_MASK = PREFETCH_INTERVAL - 1;
+
+ // ======================= Cost Model Constants =======================
+ //
+ // These constants define the relative costs of various operations,
+ // used to decide between algorithm variants.
+
+ // Cost of bucket accumulation relative to a single point addition
+ // (2 Jacobian adds per bucket, each ~2.5x cost of affine add)
+ static constexpr size_t BUCKET_ACCUMULATION_COST = 5;
+
+ // Field multiplications saved per group operation when using affine trick
+ static constexpr size_t AFFINE_TRICK_SAVINGS_PER_OP = 5;
+
+ // Extra cost of Jacobian group operation when Z coordinate != 1
+ static constexpr size_t JACOBIAN_Z_NOT_ONE_PENALTY = 5;
+
+ // Cost of computing 4-bit lookup table for modular exponentiation (14 muls)
+ static constexpr size_t INVERSION_TABLE_COST = 14;
+ // ===========================================================================
+
+ // Offset generator used in bucket reduction to probabilistically avoid incomplete-addition
+ // edge cases in the accumulator. Derived from domain-separated precomputed generators.
+ static const AffineElement& get_offset_generator() noexcept
+ {
+ static const AffineElement offset_generator = []() {
+ if constexpr (std::same_as) {
+ return get_precomputed_generators()[0];
+ } else {
+ return get_precomputed_generators()[0];
+ }
+ }();
+ return offset_generator;
+ }
+
/**
* @brief MSMWorkUnit describes an MSM that may be part of a larger MSM
* @details For a multi-MSM where each MSM has a variable size, we want to split the MSMs up
* such that every available thread has an equal amount of MSM work to perform.
- * The actual MSM algorithm used is single-threaded. This is beneficial because we get better scaling.
- *
+ * Each work unit is computed single-threaded; a single MSM may be split across
+ * threads and reduced. This approach yields better scaling than thread-parallel
+ * bucket accumulation.
*/
struct MSMWorkUnit {
size_t batch_msm_index = 0;
@@ -42,15 +95,43 @@ template class MSM {
};
using ThreadWorkUnits = std::vector;
+ /**
+ * @brief Container for MSM input data passed between algorithm stages
+ * @note scalars must be in NON-Montgomery form for correct bucket index computation
+ */
struct MSMData {
- std::span scalars;
- std::span points;
- std::span scalar_indices;
- std::span point_schedule;
+ std::span scalars; // Scalars (non-Montgomery form)
+ std::span points; // Input points
+ std::span scalar_indices; // Indices of nonzero scalars
+ std::span point_schedule; // Scratch space for point scheduling
+
+ /**
+ * @brief Factory method to construct MSMData from a work unit
+ * @details Extracts the appropriate slices from the full arrays based on MSMWorkUnit parameters
+ */
+ static MSMData from_work_unit(std::span> all_scalars,
+ std::span> all_points,
+ const std::vector>& all_indices,
+ std::span point_schedule_buffer,
+ const MSMWorkUnit& work_unit) noexcept
+ {
+ return MSMData{
+ .scalars = all_scalars[work_unit.batch_msm_index],
+ .points = all_points[work_unit.batch_msm_index],
+ .scalar_indices =
+ std::span{ &all_indices[work_unit.batch_msm_index][work_unit.start_index],
+ work_unit.size },
+ .point_schedule = point_schedule_buffer,
+ };
+ }
};
/**
- * @brief Temp data structure, one created per thread!
+ * @brief Affine bucket accumulators for the fast affine-trick Pippenger variant
+ * @details Used when handle_edge_cases=false. Stores buckets in affine coordinates,
+ * enabling use of Montgomery's batch inversion trick. Does NOT handle
+ * edge cases like point doubling or point at infinity.
+ * @note Allocated per-call for WASM compatibility.
*/
struct BucketAccumulators {
std::vector buckets;
@@ -62,6 +143,13 @@ template class MSM {
{}
};
+ /**
+ * @brief Jacobian bucket accumulators for the safe Pippenger variant
+ * @details Used when handle_edge_cases=true or when affine trick is not beneficial.
+ * Stores buckets in Jacobian coordinates which correctly handle point
+ * doubling and point at infinity edge cases.
+ * @note Allocated per-call (not thread_local) in the Jacobian Pippenger path.
+ */
struct JacobianBucketAccumulators {
std::vector buckets;
BitVector bucket_exists;
@@ -72,81 +160,99 @@ template class MSM {
{}
};
/**
- * @brief Temp data structure, one created per thread!
+ * @brief Scratch space for batched affine point additions (one per thread)
*/
struct AffineAdditionData {
static constexpr size_t BATCH_SIZE = 2048;
// when adding affine points, we have an edge case where the number of points in the batch can overflow by 2
static constexpr size_t BATCH_OVERFLOW_SIZE = 2;
std::vector points_to_add;
- std::vector scalar_scratch_space;
- std::vector addition_result_bucket_destinations;
+ std::vector inversion_scratch_space; // Used for Montgomery batch inversion denominators
+ std::vector addition_result_bucket_destinations;
+ AffineElement null_location{}; // Dummy write target for branchless conditional moves
AffineAdditionData() noexcept
: points_to_add(BATCH_SIZE + BATCH_OVERFLOW_SIZE)
- , scalar_scratch_space(BATCH_SIZE + BATCH_OVERFLOW_SIZE)
+ , inversion_scratch_space(BATCH_SIZE + BATCH_OVERFLOW_SIZE)
, addition_result_bucket_destinations(((BATCH_SIZE + BATCH_OVERFLOW_SIZE) / 2))
{}
};
- static size_t get_num_rounds(size_t num_points) noexcept
+
+ /**
+ * @brief Packed point schedule entry: (point_index << 32) | bucket_index
+ * @details Used to sort points by their target bucket for cache-efficient processing
+ */
+ struct PointScheduleEntry {
+ uint64_t data;
+
+ [[nodiscard]] static constexpr PointScheduleEntry create(uint32_t point_index, uint32_t bucket_index) noexcept
+ {
+ return { (static_cast(point_index) << 32) | bucket_index };
+ }
+ [[nodiscard]] constexpr uint32_t point_index() const noexcept { return static_cast(data >> 32); }
+ [[nodiscard]] constexpr uint32_t bucket_index() const noexcept { return static_cast(data); }
+ };
+
+ // ======================= Public Methods =======================
+ // See README.md for algorithm details and mathematical derivations.
+
+ /**
+ * @brief Main entry point for single MSM computation
+ * @param handle_edge_cases false (default): fast affine variant; true: safe Jacobian variant
+ * @note Scalars are temporarily modified but restored before returning
+ */
+ static AffineElement msm(std::span points,
+ PolynomialSpan scalars,
+ bool handle_edge_cases = false) noexcept;
+
+ /**
+ * @brief Compute multiple MSMs in parallel with work balancing
+ * @note Scalars are temporarily modified but restored before returning
+ * @see README.md "Parallelization"
+ */
+ static std::vector batch_multi_scalar_mul(std::span> points,
+ std::span> scalars,
+ bool handle_edge_cases = true) noexcept;
+
+ // ======================= Test-Visible Methods =======================
+ // Exposed for unit testing; not part of the public API.
+
+ static uint32_t get_num_rounds(size_t num_points) noexcept
{
- const size_t bits_per_slice = get_optimal_log_num_buckets(num_points);
- const size_t num_rounds = (NUM_BITS_IN_FIELD + (bits_per_slice - 1)) / bits_per_slice;
- return num_rounds;
+ const uint32_t bits_per_slice = get_optimal_log_num_buckets(num_points);
+ return static_cast((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice);
}
+
+ /** @brief Batch add n/2 independent point pairs using Montgomery's trick */
static void add_affine_points(AffineElement* points,
const size_t num_points,
typename Curve::BaseField* scratch_space) noexcept;
- static void transform_scalar_and_get_nonzero_scalar_indices(std::span scalars,
- std::vector& consolidated_indices) noexcept;
- static std::vector get_work_units(std::span> scalars,
- std::vector>& msm_scalar_indices) noexcept;
- static uint32_t get_scalar_slice(const ScalarField& scalar, size_t round, size_t normal_slice_size) noexcept;
- static size_t get_optimal_log_num_buckets(const size_t num_points) noexcept;
- static bool use_affine_trick(const size_t num_points, const size_t num_buckets) noexcept;
-
- static Element small_pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept;
- static Element pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept;
- static Element evaluate_small_pippenger_round(MSMData& msm_data,
- const size_t round_index,
- JacobianBucketAccumulators& bucket_data,
- Element previous_round_output,
- const size_t bits_per_slice) noexcept;
-
- static Element evaluate_pippenger_round(MSMData& msm_data,
- const size_t round_index,
- AffineAdditionData& affine_data,
- BucketAccumulators& bucket_data,
- Element previous_round_output,
- const size_t bits_per_slice) noexcept;
-
- static void consume_point_schedule(std::span point_schedule,
- std::span points,
- AffineAdditionData& affine_data,
- BucketAccumulators& bucket_data,
- size_t num_input_points_processed,
- size_t num_queued_affine_points) noexcept;
+ /** @brief Extract c-bit slice from scalar for bucket index computation */
+ static uint32_t get_scalar_slice(const ScalarField& scalar, size_t round, size_t slice_size) noexcept;
- static std::vector batch_multi_scalar_mul(std::span> points,
- std::span> scalars,
- bool handle_edge_cases = true) noexcept;
- static AffineElement msm(std::span points,
- PolynomialSpan _scalars,
- bool handle_edge_cases = false) noexcept;
+ /** @brief Compute optimal bits per slice by minimizing cost over c in [1, MAX_SLICE_BITS) */
+ static uint32_t get_optimal_log_num_buckets(size_t num_points) noexcept;
+ /** @brief Process sorted point schedule into bucket accumulators using batched affine additions */
+ static void batch_accumulate_points_into_buckets(std::span point_schedule,
+ std::span points,
+ AffineAdditionData& affine_data,
+ BucketAccumulators& bucket_data) noexcept;
+
+ /** @brief Reduce buckets to single point using running (suffix) sum from high to low: R = sum(k * B_k) */
template static Element accumulate_buckets(BucketType& bucket_accumulators) noexcept
{
auto& buckets = bucket_accumulators.buckets;
BB_ASSERT_DEBUG(buckets.size() > static_cast(0));
int starting_index = static_cast(buckets.size() - 1);
- Element prefix_sum;
+ Element running_sum;
bool found_start = false;
while (!found_start && starting_index > 0) {
const size_t idx = static_cast(starting_index);
if (bucket_accumulators.bucket_exists.get(idx)) {
- prefix_sum = buckets[idx];
+ running_sum = buckets[idx];
found_start = true;
} else {
starting_index -= 1;
@@ -156,32 +262,104 @@ template class MSM {
return Curve::Group::point_at_infinity;
}
BB_ASSERT_DEBUG(starting_index > 0);
- AffineElement offset_generator = Curve::Group::affine_point_at_infinity;
- if constexpr (std::same_as) {
- constexpr auto gen = get_precomputed_generators()[0];
- offset_generator = gen;
- } else {
- constexpr auto gen = get_precomputed_generators()[0];
- offset_generator = gen;
- }
- Element sum = prefix_sum + offset_generator;
- for (int i = static_cast(starting_index - 1); i > 0; --i) {
+ const auto& offset_generator = get_offset_generator();
+ Element sum = running_sum + offset_generator;
+ for (int i = starting_index - 1; i > 0; --i) {
size_t idx = static_cast(i);
BB_ASSERT_DEBUG(idx < bucket_accumulators.bucket_exists.size());
if (bucket_accumulators.bucket_exists.get(idx)) {
-
- prefix_sum += buckets[idx];
+ running_sum += buckets[idx];
}
- sum += prefix_sum;
+ sum += running_sum;
}
return sum - offset_generator;
}
+
+ private:
+ // ======================= Private Implementation =======================
+
+ /** @brief Convert scalars from Montgomery form and collect indices of nonzero scalars */
+ static void transform_scalar_and_get_nonzero_scalar_indices(std::span scalars,
+ std::vector& nonzero_scalar_indices) noexcept;
+
+ /** @brief Distribute multiple MSMs across threads with balanced point counts */
+ static std::vector get_work_units(std::span> scalars,
+ std::vector>& msm_scalar_indices) noexcept;
+
+ /** @brief Decide if batch inversion saves work vs Jacobian additions */
+ static bool use_affine_trick(size_t num_points, size_t num_buckets) noexcept;
+
+ /** @brief Pippenger using Jacobian buckets (handles edge cases: doubling, infinity) */
+ static Element jacobian_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept;
+
+ /** @brief Pippenger using affine buckets with batch inversion (faster, no edge case handling) */
+ static Element affine_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept;
+
+ // Helpers for batch_accumulate_points_into_buckets. Inlined for performance.
+
+ // Process single point: if bucket has accumulator, pair them for addition; else cache in bucket.
+ __attribute__((always_inline)) static void process_single_point(size_t bucket,
+ const AffineElement* point_source,
+ AffineAdditionData& affine_data,
+ BucketAccumulators& bucket_data,
+ size_t& scratch_it,
+ size_t& point_it) noexcept
+ {
+ bool has_accumulator = bucket_data.bucket_exists.get(bucket);
+ if (has_accumulator) {
+ affine_data.points_to_add[scratch_it] = *point_source;
+ affine_data.points_to_add[scratch_it + 1] = bucket_data.buckets[bucket];
+ bucket_data.bucket_exists.set(bucket, false);
+ affine_data.addition_result_bucket_destinations[scratch_it >> 1] = static_cast(bucket);
+ scratch_it += 2;
+ } else {
+ bucket_data.buckets[bucket] = *point_source;
+ bucket_data.bucket_exists.set(bucket, true);
+ }
+ point_it += 1;
+ }
+
+ // Branchless bucket pair processing. Updates point_it (by 2 if same bucket, else 1) and scratch_it.
+ // See README.md "batch_accumulate_points_into_buckets Algorithm" for case analysis.
+ __attribute__((always_inline)) static void process_bucket_pair(size_t lhs_bucket,
+ size_t rhs_bucket,
+ const AffineElement* lhs_source,
+ const AffineElement* rhs_source_if_match,
+ AffineAdditionData& affine_data,
+ BucketAccumulators& bucket_data,
+ size_t& scratch_it,
+ size_t& point_it) noexcept
+ {
+ bool has_bucket_accumulator = bucket_data.bucket_exists.get(lhs_bucket);
+ bool buckets_match = lhs_bucket == rhs_bucket;
+ bool do_affine_add = buckets_match || has_bucket_accumulator;
+
+ const AffineElement* rhs_source = buckets_match ? rhs_source_if_match : &bucket_data.buckets[lhs_bucket];
+
+ AffineElement* lhs_destination =
+ do_affine_add ? &affine_data.points_to_add[scratch_it] : &bucket_data.buckets[lhs_bucket];
+ AffineElement* rhs_destination =
+ do_affine_add ? &affine_data.points_to_add[scratch_it + 1] : &affine_data.null_location;
+
+ uint32_t& dest_bucket = affine_data.addition_result_bucket_destinations[scratch_it >> 1];
+ dest_bucket = do_affine_add ? static_cast(lhs_bucket) : dest_bucket;
+
+ *lhs_destination = *lhs_source;
+ *rhs_destination = *rhs_source;
+
+ bucket_data.bucket_exists.set(lhs_bucket, (has_bucket_accumulator && buckets_match) || !do_affine_add);
+ scratch_it += do_affine_add ? 2 : 0;
+ point_it += (do_affine_add && buckets_match) ? 2 : 1;
+ }
};
+/** @brief Safe MSM wrapper (defaults to handle_edge_cases=true) */
template
typename Curve::Element pippenger(PolynomialSpan scalars,
std::span points,
bool handle_edge_cases = true) noexcept;
+
+/** @brief Fast MSM wrapper for linearly independent points (no edge case handling) */
template
typename Curve::Element pippenger_unsafe(PolynomialSpan scalars,
std::span points) noexcept;
@@ -189,5 +367,4 @@ typename Curve::Element pippenger_unsafe(PolynomialSpan;
extern template class MSM;
-// NEXT STEP ACCUMULATE BUVKETS
} // namespace bb::scalar_multiplication
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp
index f1b79f2d1f3c..b1d2ee42801a 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp
@@ -55,6 +55,7 @@ template class ScalarMultiplicationTest : public ::testing::Test {
}
return AffineElement(expected_acc);
}
+
static void SetUpTestSuite()
{
generators.resize(num_points);
@@ -69,502 +70,573 @@ template class ScalarMultiplicationTest : public ::testing::Test {
ASSERT_EQ(generators[i].x == generators[i + 1].x, false);
}
};
-};
-using CurveTypes = ::testing::Types;
-TYPED_TEST_SUITE(ScalarMultiplicationTest, CurveTypes);
+ // ======================= Test Methods =======================
-#define SCALAR_MULTIPLICATION_TYPE_ALIASES \
- using Curve = TypeParam; \
- using ScalarField = typename Curve::ScalarField; \
- // using AffineElement = typename Curve::AffineEleent;
+ void test_get_scalar_slice()
+ {
+ constexpr uint32_t fr_size = 254;
+ constexpr uint32_t slice_bits = 7;
+ constexpr uint32_t num_slices = (fr_size + 6) / 7;
+ constexpr uint32_t last_slice_bits = fr_size - ((num_slices - 1) * slice_bits);
+
+ for (size_t x = 0; x < 100; ++x) {
+ uint256_t input_u256 = engine.get_random_uint256();
+ input_u256.data[3] = input_u256.data[3] & 0x3FFFFFFFFFFFFFFF; // 254 bits
+ while (input_u256 > ScalarField::modulus) {
+ input_u256 -= ScalarField::modulus;
+ }
+ std::vector slices(num_slices);
+
+ uint256_t acc = input_u256;
+ for (uint32_t i = 0; i < num_slices; ++i) {
+ uint32_t mask = ((1U << slice_bits) - 1U);
+ uint32_t shift = slice_bits;
+ if (i == 0) {
+ mask = ((1U << last_slice_bits) - 1U);
+ shift = last_slice_bits;
+ }
+ slices[num_slices - 1 - i] = static_cast((acc & mask).data[0]);
+ acc = acc >> shift;
+ }
-TYPED_TEST(ScalarMultiplicationTest, GetScalarSlice)
-{
- SCALAR_MULTIPLICATION_TYPE_ALIASES
- const size_t fr_size = 254;
- const size_t slice_bits = 7;
- size_t num_slices = (fr_size + 6) / 7;
- size_t last_slice_bits = fr_size - ((num_slices - 1) * slice_bits);
-
- for (size_t x = 0; x < 100; ++x) {
-
- uint256_t input_u256 = engine.get_random_uint256();
- input_u256.data[3] = input_u256.data[3] & 0x3FFFFFFFFFFFFFFF; // 254 bits
- while (input_u256 > ScalarField::modulus) {
- input_u256 -= ScalarField::modulus;
- }
- std::vector slices(num_slices);
-
- uint256_t acc = input_u256;
- for (size_t i = 0; i < num_slices; ++i) {
- size_t mask = ((1 << slice_bits) - 1UL);
- size_t shift = slice_bits;
- if (i == 0) {
- mask = ((1UL << last_slice_bits) - 1UL);
- shift = last_slice_bits;
+ ScalarField input(input_u256);
+ input.self_from_montgomery_form();
+
+ ASSERT_EQ(input.data[0], input_u256.data[0]);
+ ASSERT_EQ(input.data[1], input_u256.data[1]);
+ ASSERT_EQ(input.data[2], input_u256.data[2]);
+ ASSERT_EQ(input.data[3], input_u256.data[3]);
+
+ for (uint32_t i = 0; i < num_slices; ++i) {
+ uint32_t result = scalar_multiplication::MSM::get_scalar_slice(input, i, slice_bits);
+ EXPECT_EQ(result, slices[i]);
}
- slices[num_slices - 1 - i] = static_cast((acc & mask).data[0]);
- acc = acc >> shift;
}
- // uint256_t input_u256 = 0;
-
- // for (size_t i = 0; i < num_slices; ++i) {
- // bool valid_slice = false;
- // while (!valid_slice) {
- // size_t mask = ((1 << slice_bits) - 1);
- // if (i == num_slices - 1) {
- // mask = ((1 << last_slice_bits) - 1);
- // }
- // const uint32_t slice = engine.get_random_uint32() & mask;
-
- // size_t shift = (fr_size - slice_bits - (i * slice_bits));
- // if (i == num_slices - 1) {
- // shift = 0;
- // }
-
- // const uint256_t new_input_u256 = input_u256 + (uint256_t(slice) << shift);
- // // BB_ASSERT(new_input_u256 < fr::modulus);
- // if (new_input_u256 < fr::modulus) {
- // input_u256 = new_input_u256;
- // slices[i] = slice;
- // valid_slice = true;
- // }
- // }
- // }
-
- // BB_ASSERT(input_u256 < fr::modulus);
- // while (input_u256 > fr::modulus) {
- // input_u256 -= fr::modulus;
- // }
- ScalarField input(input_u256);
- input.self_from_montgomery_form();
-
- ASSERT_EQ(input.data[0], input_u256.data[0]);
- ASSERT_EQ(input.data[1], input_u256.data[1]);
- ASSERT_EQ(input.data[2], input_u256.data[2]);
- ASSERT_EQ(input.data[3], input_u256.data[3]);
-
- for (size_t i = 0; i < num_slices; ++i) {
-
- uint32_t result = scalar_multiplication::MSM::get_scalar_slice(input, i, slice_bits);
- EXPECT_EQ(result, slices[i]);
+ }
+
+ void test_consume_point_batch()
+ {
+ const size_t total_points = 30071;
+ const size_t num_buckets = 128;
+
+ std::vector input_point_schedule;
+ for (size_t i = 0; i < total_points; ++i) {
+ uint64_t bucket = static_cast(engine.get_random_uint8()) & 0x7f;
+ uint64_t schedule = static_cast(bucket) + (static_cast(i) << 32);
+ input_point_schedule.push_back(schedule);
+ }
+ typename scalar_multiplication::MSM::AffineAdditionData affine_data;
+ typename scalar_multiplication::MSM::BucketAccumulators bucket_data(num_buckets);
+ scalar_multiplication::MSM::batch_accumulate_points_into_buckets(
+ input_point_schedule, generators, affine_data, bucket_data);
+
+ std::vector expected_buckets(num_buckets);
+ for (auto& e : expected_buckets) {
+ e.self_set_infinity();
+ }
+ for (size_t i = 0; i < total_points; ++i) {
+ uint64_t bucket = input_point_schedule[i] & 0xFFFFFFFF;
+ EXPECT_LT(static_cast(bucket), num_buckets);
+ expected_buckets[static_cast(bucket)] += generators[i];
+ }
+ for (size_t i = 0; i < num_buckets; ++i) {
+ if (!expected_buckets[i].is_point_at_infinity()) {
+ AffineElement expected(expected_buckets[i]);
+ EXPECT_EQ(expected, bucket_data.buckets[i]);
+ } else {
+ EXPECT_FALSE(bucket_data.bucket_exists.get(i));
+ }
}
}
- // fr test = 0;
- // test.data[0] = 0b;
- // test.data[1] = 0b010101
-}
-TYPED_TEST(ScalarMultiplicationTest, ConsumePointBatch)
-{
- using Curve = TypeParam;
- using AffineElement = typename Curve::AffineElement;
- // todo make this not a multiple of 10k
- const size_t total_points = 30071;
- const size_t num_buckets = 128;
+ void test_consume_point_batch_and_accumulate()
+ {
+ const size_t total_points = 30071;
+ const size_t num_buckets = 128;
+
+ std::vector input_point_schedule;
+ for (size_t i = 0; i < total_points; ++i) {
+ uint64_t bucket = static_cast(engine.get_random_uint8()) & 0x7f;
+ uint64_t schedule = static_cast(bucket) + (static_cast(i) << 32);
+ input_point_schedule.push_back(schedule);
+ }
+ typename scalar_multiplication::MSM::AffineAdditionData affine_data;
+ typename scalar_multiplication::MSM