diff --git a/barretenberg/cpp/CLAUDE.md b/barretenberg/cpp/CLAUDE.md index dbe3aaa51ad1..e6d98e4776ca 100644 --- a/barretenberg/cpp/CLAUDE.md +++ b/barretenberg/cpp/CLAUDE.md @@ -2,6 +2,14 @@ succint aztec-packages cheat sheet. THE PROJECT ROOT IS AT TWO LEVELS ABOVE THIS FOLDER. Typically, the repository is at ~/aztec-packages. all advice is from the root. +# Git workflow for barretenberg + +**IMPORTANT**: When comparing branches or looking at diffs for barretenberg work, use `merge-train/barretenberg` as the base branch, NOT `master`. The master branch is often outdated for barretenberg development. + +Examples: +- `git diff merge-train/barretenberg...HEAD` (not `git diff master...HEAD`) +- `git log merge-train/barretenberg..HEAD` (not `git log master..HEAD`) + Run ./bootstrap.sh at the top-level to be sure the repo fully builds. Bootstrap scripts can be called with relative paths e.g. ../barretenberg/bootstrap.sh diff --git a/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh b/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh index 630d928089d0..748bb978fef4 100755 --- a/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh +++ b/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh @@ -16,7 +16,7 @@ PRESET=${3:-clang20} BUILD_DIR=${4:-build} HARDWARE_CONCURRENCY=${HARDWARE_CONCURRENCY:-16} -BASELINE_BRANCH="master" +BASELINE_BRANCH="${BASELINE_BRANCH:-merge-train/barretenberg}" BENCH_TOOLS_DIR="$BUILD_DIR/_deps/benchmark-src/tools" if [ ! -z "$(git status --untracked-files=no --porcelain)" ]; then diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/README.md b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/README.md new file mode 100644 index 000000000000..17e135237eeb --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/README.md @@ -0,0 +1,183 @@ +# Pippenger Multi-Scalar Multiplication (MSM) + +## Overview + +The Pippenger algorithm computes multi-scalar multiplications: + +$$\text{MSM}(\vec{s}, \vec{P}) = \sum_{i=0}^{n-1} s_i \cdot P_i$$ + +**Complexity**: Let $q = \lceil \log_2(\text{field modulus}) \rceil$ be the scalar bit-length, $|A|$ the cost of a group addition, and $|D|$ the cost of a doubling. + +- **Pippenger**: $O\left(\frac{q}{c} \cdot \left((n + 2^c) \cdot |A| + c \cdot |D|\right)\right)$ +- **Naive**: $O(n \cdot q \cdot |D| + n \cdot q \cdot |A| / 2)$ + +With $c \approx \frac{1}{2} \log_2 n$, Pippenger achieves roughly $O(n \cdot q / \log n)$ vs $O(n \cdot q)$ for naive scalar multiplication. + +## Algorithm + +### Step 1: Scalar Decomposition + +**Implementation**: `get_scalar_slice(scalar, round_index, bits_per_slice)` + +Each scalar $s_i$ is decomposed into $r$ slices of $c$ bits each, processed **MSB-first**: + +$$s_i = \sum_{j=0}^{r-1} s_i^{(j)} \cdot 2^{c(r-1-j)}$$ + +- $c$ = bits per slice (from `get_optimal_log_num_buckets`, which brute-force searches for minimum cost) +- $r = \lceil $ `NUM_BITS_IN_FIELD` $/ c \rceil$ = number of rounds +- Round 0 extracts the most significant bits + +### Step 2: Bucket Accumulation + +For each round $j$, points are added into **buckets** based on their scalar slice. Bucket $k$ accumulates all points whose slice value equals $k$: + +$$B_k^{(j)} = \sum_{\{i : s_i^{(j)} = k\}} P_i$$ + +**Two implementation paths:** + +- **Affine**: Sorts points by bucket and uses batched affine additions +- **Jacobian**: Direct bucket accumulation in Jacobian coordinates + +### Step 3: Bucket Reduction + +**Implementation**: `accumulate_buckets(bucket_accumulators)` + +Computes weighted sum using a suffix sum (high to low): + +$$R^{(j)} = \sum_{k=1}^{2^c - 1} k \cdot B_k^{(j)} = \sum_{k=1}^{2^c - 1} \left( \sum_{m=k}^{2^c - 1} B_m^{(j)} \right)$$ + +An offset generator is added and subtracted to avoid rare accumulator edge cases—a probabilistic mitigation that simplifies accumulation logic. + +### Step 4: Round Combination + +Combines all rounds using Horner's method (MSB-first): + +```cpp +msm_accumulator = point_at_infinity +for j = 0 to r-1: + repeat c doublings (or fewer for final round) + msm_accumulator += bucket_result[j] +``` + +## Algorithm Variants + +### Entry Points and Safety + +| Entry Point | Default | Safety | +|-------------|---------|--------| +| `msm()` | `handle_edge_cases=false` | ⚠️ **Unsafe** | +| `pippenger()` | `handle_edge_cases=true` | ✓ Safe | +| `pippenger_unsafe()` | `handle_edge_cases=false` | ⚠️ Unsafe | +| `batch_multi_scalar_mul()` | `handle_edge_cases=true` | ✓ Safe | + +### Edge Cases + +Affine addition fails for **P = Q** (doubling), **P = −Q** (inverse), and **P = O** (identity). Jacobian coordinates handle these correctly at higher cost (~2-3× slower). + +⚠️ **Use `msm()` or `pippenger_unsafe()` only when points are guaranteed linearly independent** (e.g., SRS points). For user-controlled or potentially duplicate points, use `pippenger()`. + +### Affine Pippenger (`handle_edge_cases=false`) + +Uses affine coordinates with Montgomery's batch inversion trick: replaces $m$ inversions with **1 inversion + O(m) multiplications**, yielding ~2-3× speedup over Jacobian. + +### Jacobian Pippenger (`handle_edge_cases=true`) + +Uses Jacobian coordinates for bucket accumulators. Handles all edge cases correctly. + +## Tuning Constants + +| Constant | Value | Purpose | +|----------|-------|---------| +| `PIPPENGER_THRESHOLD` | 16 | Below this, use naive scalar multiplication | +| `AFFINE_TRICK_THRESHOLD` | 128 | Below this, batch inversion overhead exceeds savings | +| `MAX_SLICE_BITS` | 20 | Upper bound on bucket count exponent | +| `BATCH_SIZE` | 2048 | Points per batch inversion (fits L2 cache) | +| `RADIX_BITS` | 8 | Bits per radix sort pass | + +
+Cost model constants and derivations + +| Constant | Value | Derivation | +|----------|-------|------------| +| `BUCKET_ACCUMULATION_COST` | 5 | 2 Jacobian adds/bucket × 2.5× cost ratio | +| `AFFINE_TRICK_SAVINGS_PER_OP` | 5 | ~10 muls saved − ~3 muls for product tree | +| `JACOBIAN_Z_NOT_ONE_PENALTY` | 5 | Extra field ops when Z ≠ 1 | +| `INVERSION_TABLE_COST` | 14 | 4-bit lookup table for modular exp | + +**BATCH_SIZE=2048**: Each `AffineElement` is 64 bytes. 2048 points = 128 KB, fitting in L2 cache. + +**RADIX_BITS=8**: 256 radix buckets × 4 bytes = 1 KB counting array, fits in L1 cache. + +
+ +## Implementation Notes + +### Zero Scalar Filtering + +`transform_scalar_and_get_nonzero_scalar_indices` filters out zero scalars before processing (since $0 \cdot P_i = \mathcal{O}$). Scalars are converted from Montgomery form in-place to avoid doubling memory usage. + +### Bucket Existence Tracking + +A `BitVector` bitmap tracks which buckets are populated, avoiding expensive full-array clears between rounds. Clearing the bitmap costs $O(2^c / 64)$ words vs $O(2^c)$ for the full bucket array. + +### Point Scheduling (Affine Variant Only) + +Entries are packed as `(point_index << 32) | bucket_index` into 64-bit values. Since bucket indices fit in $c$ bits (typically 8-16), they occupy only the lowest bits of the packed entry. An **in-place MSD radix sort** on the low $c$ bits groups points by bucket for efficient batch processing. The sort also detects entries with `bucket_index == 0` during the final radix pass, allowing zero-bucket entries to be skipped without a separate scan. + +### Batched Affine Addition + +`batch_accumulate_points_into_buckets` processes sorted points iteratively: +- Same-bucket pairs → queue for batch addition +- Different buckets → cache in bucket or queue with existing accumulator +- Uses branchless conditional moves to minimize pipeline stalls +- Prefetches future points to hide memory latency +- Recirculates results to maximize batch efficiency before writing to buckets + +
+Batch accumulation case analysis + +| Condition | Action | Iterator Update | +|-----------|--------|-----------------| +| `bucket[i] == bucket[i+1]` | Queue both points for batch add | `point_it += 2` | +| Different buckets, accumulator exists | Queue point + accumulator | `point_it += 1` | +| Different buckets, no accumulator | Cache point into bucket | `point_it += 1` | + +After batch addition, results targeting the same bucket are paired again before writing to bucket accumulators, reducing random memory access by ~50%. + +
+ +## Parallelization + +Uses **per-thread buffers** (bucket accumulators, scratch space) to eliminate contention. + +For `batch_multi_scalar_mul()`, work is distributed via `MSMWorkUnit` structures that can split a single MSM across multiple threads. Each thread computes partial results on point subsets, combined in a final reduction. + +
+Per-call buffer sizes + +| Buffer | Size | Purpose | +|--------|------|---------| +| `BucketAccumulators` (affine) | $2^c × 64$ bytes | Affine bucket array + bitmap | +| `JacobianBucketAccumulators` | $2^c × 96$ bytes | Jacobian bucket array + bitmap | +| `AffineAdditionData` | ~400 KB | Scratch for batch inversion | +| `point_schedule` | $n × 8$ bytes | Per-MSM point schedule | + +Buffers are allocated per-call for WASM compatibility. Memory scales with thread count during parallel execution. + +
+ +## File Structure + +``` +scalar_multiplication/ +├── scalar_multiplication.hpp # MSM class, data structures +├── scalar_multiplication.cpp # Core algorithm +├── process_buckets.hpp/cpp # Radix sort +├── bitvector.hpp # Bit vector for bucket tracking +└── README.md # This file +``` + +## References + +1. Pippenger, N. (1976). "On the evaluation of powers and related problems" +2. Bernstein, D.J. et al. "Faster batch forgery identification" (batch inversion) diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp index a6e767961b44..c527b7b7b1b4 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp @@ -10,89 +10,97 @@ namespace bb::scalar_multiplication { -// NOLINTNEXTLINE(misc-no-recursion) recursion is fine here, max recursion depth is 8 (64 bit int / 8 bits per call) +// NOLINTNEXTLINE(misc-no-recursion) recursion is fine here, max depth is 4 (32-bit bucket index / 8 bits per call) void radix_sort_count_zero_entries(uint64_t* keys, const size_t num_entries, const uint32_t shift, size_t& num_zero_entries, - const uint32_t total_bits, - const uint64_t* start_pointer) noexcept + const uint32_t bucket_index_bits, + const uint64_t* top_level_keys) noexcept { - constexpr size_t num_bits = 8; - constexpr size_t num_buckets = 1UL << num_bits; - constexpr uint32_t mask = static_cast(num_buckets) - 1U; - std::array bucket_counts{}; + constexpr size_t NUM_RADIX_BUCKETS = 1UL << RADIX_BITS; + constexpr uint32_t RADIX_MASK = static_cast(NUM_RADIX_BUCKETS) - 1U; + // Step 1: Count entries in each radix bucket + std::array bucket_counts{}; for (size_t i = 0; i < num_entries; ++i) { - bucket_counts[(keys[i] >> shift) & mask]++; + bucket_counts[(keys[i] >> shift) & RADIX_MASK]++; } - std::array offsets; - std::array offsets_copy; + // Step 2: Convert counts to cumulative offsets (prefix sum) + std::array offsets; + std::array offsets_copy; offsets[0] = 0; - - for (size_t i = 0; i < num_buckets - 1; ++i) { + for (size_t i = 0; i < NUM_RADIX_BUCKETS - 1; ++i) { bucket_counts[i + 1] += bucket_counts[i]; } - if ((shift == 0) && (keys == start_pointer)) { + + // Count zero entries only at the final recursion level (shift == 0) and only for the full array + if ((shift == 0) && (keys == top_level_keys)) { num_zero_entries = bucket_counts[0]; } - for (size_t i = 1; i < num_buckets + 1; ++i) { + + for (size_t i = 1; i < NUM_RADIX_BUCKETS + 1; ++i) { offsets[i] = bucket_counts[i - 1]; } - for (size_t i = 0; i < num_buckets + 1; ++i) { + for (size_t i = 0; i < NUM_RADIX_BUCKETS + 1; ++i) { offsets_copy[i] = offsets[i]; } - uint64_t* start = &keys[0]; - for (size_t i = 0; i < num_buckets; ++i) { + // Step 3: In-place permutation using cycle sort + // For each radix bucket, repeatedly swap elements to their correct positions until all elements + // in that bucket's range belong there. The offsets array tracks the next write position for each bucket. + uint64_t* start = &keys[0]; + for (size_t i = 0; i < NUM_RADIX_BUCKETS; ++i) { uint64_t* bucket_start = &keys[offsets[i]]; const uint64_t* bucket_end = &keys[offsets_copy[i + 1]]; while (bucket_start != bucket_end) { for (uint64_t* it = bucket_start; it < bucket_end; ++it) { - const size_t value = (*it >> shift) & mask; + const size_t value = (*it >> shift) & RADIX_MASK; const uint64_t offset = offsets[value]++; std::iter_swap(it, start + offset); } bucket_start = &keys[offsets[i]]; } } + + // Step 4: Recursively sort each bucket by the next less-significant byte if (shift > 0) { - for (size_t i = 0; i < num_buckets; ++i) { - if (offsets_copy[i + 1] - offsets_copy[i] > 1) { - radix_sort_count_zero_entries(&keys[offsets_copy[i]], - offsets_copy[i + 1] - offsets_copy[i], - shift - 8, - num_zero_entries, - total_bits, - keys); + for (size_t i = 0; i < NUM_RADIX_BUCKETS; ++i) { + const size_t bucket_size = offsets_copy[i + 1] - offsets_copy[i]; + if (bucket_size > 1) { + radix_sort_count_zero_entries( + &keys[offsets_copy[i]], bucket_size, shift - RADIX_BITS, num_zero_entries, bucket_index_bits, keys); } } } } -size_t process_buckets_count_zero_entries(uint64_t* wnaf_entries, - const size_t num_entries, - const uint32_t num_bits) noexcept +size_t sort_point_schedule_and_count_zero_buckets(uint64_t* point_schedule, + const size_t num_entries, + const uint32_t bucket_index_bits) noexcept { if (num_entries == 0) { return 0; } - const uint32_t bits_per_round = 8; - const uint32_t base = num_bits & 7; - const uint32_t total_bits = (base == 0) ? num_bits : num_bits - base + 8; - const uint32_t shift = total_bits - bits_per_round; + + // Round bucket_index_bits up to next multiple of RADIX_BITS for proper MSD radix sort alignment. + // E.g., if bucket_index_bits=10, we need to start sorting from bit 16 (2 bytes) not bit 10. + const uint32_t remainder = bucket_index_bits % RADIX_BITS; + const uint32_t padded_bits = (remainder == 0) ? bucket_index_bits : bucket_index_bits - remainder + RADIX_BITS; + const uint32_t initial_shift = padded_bits - RADIX_BITS; + size_t num_zero_entries = 0; - radix_sort_count_zero_entries(wnaf_entries, num_entries, shift, num_zero_entries, num_bits, wnaf_entries); - - // inside radix_sort_count_zero_entries, if the least significant *byte* of `wnaf_entries[0] == 0`, - // then num_nonzero_entries = number of entries that share the same value as wnaf_entries[0]. - // If wnaf_entries[0] != 0, we must manually set num_zero_entries = 0 - if (num_entries > 0) { - if ((wnaf_entries[0] & 0xffffffff) != 0) { - num_zero_entries = 0; - } + radix_sort_count_zero_entries( + point_schedule, num_entries, initial_shift, num_zero_entries, bucket_index_bits, point_schedule); + + // The radix sort counts entries where the least significant BYTE is zero, but we need entries where + // the entire bucket_index (lower 32 bits) is zero. Verify the first entry after sorting. + if ((point_schedule[0] & BUCKET_INDEX_MASK) != 0) { + num_zero_entries = 0; } + return num_zero_entries; } + } // namespace bb::scalar_multiplication diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.hpp index e964fcc7adf0..7c5a8c68b881 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.hpp @@ -10,13 +10,47 @@ #include namespace bb::scalar_multiplication { + +// Number of bits processed per radix sort pass +constexpr uint32_t RADIX_BITS = 8; + +// Mask for extracting bucket index from point schedule entry (lower 32 bits) +constexpr uint64_t BUCKET_INDEX_MASK = 0xffffffff; + +/** + * @brief Recursive MSD radix sort that also counts entries with zero bucket index + * @details Sorts point schedule entries by their bucket index (lower 32 bits) using most-significant-digit + * radix sort. Processes RADIX_BITS bits per recursive call, starting from the most significant byte. + * As a side effect, counts entries where bucket_index == 0 (which are skipped during MSM). + * + * @param keys Array of point schedule entries to sort in-place (format: point_index << 32 | bucket_index) + * @param num_entries Number of entries to sort + * @param shift Current bit position to sort on (starts high, decreases by RADIX_BITS each recursion) + * @param[out] num_zero_entries Count of entries with bucket_index == 0 (only set at final recursion level) + * @param bucket_index_bits Number of bits in the bucket index (used to determine recursion depth) + * @param top_level_keys Pointer to original array start; used to detect top-level call for zero counting + */ void radix_sort_count_zero_entries(uint64_t* keys, - const size_t num_entries, - const uint32_t shift, + size_t num_entries, + uint32_t shift, size_t& num_zero_entries, - const uint32_t total_bits, - const uint64_t* start_pointer) noexcept; -size_t process_buckets_count_zero_entries(uint64_t* wnaf_entries, - const size_t num_entries, - const uint32_t num_bits) noexcept; + uint32_t bucket_index_bits, + const uint64_t* top_level_keys) noexcept; + +/** + * @brief Sort point schedule by bucket index and count zero-bucket entries + * @details Entry point for radix sort. Sorts entries so that points targeting the same bucket + * are contiguous, improving cache locality during bucket accumulation. Also counts + * entries with bucket_index == 0, which represent scalar bits that don't contribute + * to any bucket in the current Pippenger round. + * + * @param point_schedule Array of packed entries (point_index << 32 | bucket_index) to sort in-place + * @param num_entries Number of entries + * @param bucket_index_bits Number of bits used for bucket indices (determines sort range) + * @return Number of entries with bucket_index == 0 (these are at the start after sorting) + */ +size_t sort_point_schedule_and_count_zero_buckets(uint64_t* point_schedule, + size_t num_entries, + uint32_t bucket_index_bits) noexcept; + } // namespace bb::scalar_multiplication diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp index d4199c0e6cd4..7e73e8ddbe66 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp @@ -21,21 +21,14 @@ namespace bb::scalar_multiplication { -/** - * @brief Fallback method for very small numbers of input points - * - * @tparam Curve - * @param scalars - * @param points - * @param range - * @return Curve::Element - */ -template -typename Curve::Element small_mul(std::span& scalars, - std::span& points, - std::span scalar_indices, - size_t range) noexcept +// Naive double-and-add fallback for small inputs (< PIPPENGER_THRESHOLD points). +template typename Curve::Element small_mul(const typename MSM::MSMData& msm_data) noexcept { + const auto& scalars = msm_data.scalars; + const auto& points = msm_data.points; + const auto& scalar_indices = msm_data.scalar_indices; + const size_t range = scalar_indices.size(); + typename Curve::Element r = Curve::Group::point_at_infinity; for (size_t i = 0; i < range; ++i) { typename Curve::Element f = points[scalar_indices[i]]; @@ -44,21 +37,14 @@ typename Curve::Element small_mul(std::span& return r; } -/** - * @brief Convert scalar out of Montgomery form. Populate `consolidated_indices` with nonzero scalar indices - * - * @tparam Curve - * @param scalars - * @param consolidated_indices - */ template void MSM::transform_scalar_and_get_nonzero_scalar_indices(std::span scalars, - std::vector& consolidated_indices) noexcept + std::vector& nonzero_scalar_indices) noexcept { std::vector> thread_indices(get_num_cpus()); + // Pass 1: Each thread converts from Montgomery and collects nonzero indices into its own vector parallel_for([&](const ThreadChunk& chunk) { - // parallel_for with ThreadChunk uses get_num_cpus() threads BB_ASSERT_EQ(chunk.total_threads, thread_indices.size()); auto range = chunk.range(scalars.size()); if (range.empty()) { @@ -71,9 +57,7 @@ void MSM::transform_scalar_and_get_nonzero_scalar_indices(std::span(i)); } } @@ -83,32 +67,21 @@ void MSM::transform_scalar_and_get_nonzero_scalar_indices(std::span::ThreadWorkUnits> - */ template std::vector::ThreadWorkUnits> MSM::get_work_units( std::span> scalars, std::vector>& msm_scalar_indices) noexcept @@ -117,7 +90,6 @@ std::vector::ThreadWorkUnits> MSM::get_work_units( const size_t num_msms = scalars.size(); msm_scalar_indices.resize(num_msms); for (size_t i = 0; i < num_msms; ++i) { - BB_ASSERT_LT(i, scalars.size()); transform_scalar_and_get_nonzero_scalar_indices(scalars[i], msm_scalar_indices[i]); } @@ -130,12 +102,9 @@ std::vector::ThreadWorkUnits> MSM::get_work_units( std::vector work_units(num_threads); const size_t work_per_thread = numeric::ceil_div(total_work, num_threads); - size_t work_of_last_thread = total_work - (work_per_thread * (num_threads - 1)); + const size_t work_of_last_thread = total_work - (work_per_thread * (num_threads - 1)); - // [(MSMs + T - 1) / T] * [T - 1] > MSMs - // T = 192 - // ([M + 191] / 192) * 193 > M - // only use a single work unit if we don't have enough work for every thread + // Only use a single work unit if we don't have enough work for every thread if (num_threads > total_work) { for (size_t i = 0; i < num_msms; ++i) { work_units[0].push_back(MSMWorkUnit{ @@ -150,31 +119,28 @@ std::vector::ThreadWorkUnits> MSM::get_work_units( size_t thread_accumulated_work = 0; size_t current_thread_idx = 0; for (size_t i = 0; i < num_msms; ++i) { - BB_ASSERT_DEBUG(i < msm_scalar_indices.size()); - size_t msm_work = msm_scalar_indices[i].size(); - size_t msm_size = msm_work; - while (msm_work > 0) { + size_t msm_work_remaining = msm_scalar_indices[i].size(); + const size_t initial_msm_work = msm_work_remaining; + + while (msm_work_remaining > 0) { + BB_ASSERT_LT(current_thread_idx, work_units.size()); + const size_t total_thread_work = (current_thread_idx == num_threads - 1) ? work_of_last_thread : work_per_thread; const size_t available_thread_work = total_thread_work - thread_accumulated_work; + const size_t work_to_assign = std::min(available_thread_work, msm_work_remaining); - if (available_thread_work >= msm_work) { - BB_ASSERT_LT(current_thread_idx, work_units.size()); - work_units[current_thread_idx].push_back(MSMWorkUnit{ - .batch_msm_index = i, - .start_index = msm_size - msm_work, - .size = msm_work, - }); - thread_accumulated_work += msm_work; - msm_work = 0; - } else { - BB_ASSERT_LT(current_thread_idx, work_units.size()); - work_units[current_thread_idx].push_back(MSMWorkUnit{ - .batch_msm_index = i, - .start_index = msm_size - msm_work, - .size = available_thread_work, - }); - msm_work -= available_thread_work; + work_units[current_thread_idx].push_back(MSMWorkUnit{ + .batch_msm_index = i, + .start_index = initial_msm_work - msm_work_remaining, + .size = work_to_assign, + }); + + thread_accumulated_work += work_to_assign; + msm_work_remaining -= work_to_assign; + + // Move to next thread if current thread is full + if (thread_accumulated_work >= total_thread_work) { current_thread_idx++; thread_accumulated_work = 0; } @@ -184,142 +150,91 @@ std::vector::ThreadWorkUnits> MSM::get_work_units( } /** - * @brief Given a scalar that is *NOT* in Montgomery form, extract a `slice_size`-bit chunk - * @brief At round i, we extract `slice_size * (i-1)` to `slice_sice * i` most significant bits. + * @brief Extract a slice of bits from a scalar for Pippenger bucket assignment + * @details Extracts bits [lo_bit, hi_bit) from the scalar's raw limb representation. + * The scalar must already be converted out of Montgomery form. + * + * IMPORTANT RESTRICTIONS (optimized for Pippenger's specific usage pattern): + * - slice_size must be <= 32 bits (returns uint32_t) + * - The bit range must span at most 2 limbs (satisfied when slice_size <= 64) + * - hi_bit must be < 256 to avoid out-of-bounds access (satisfied since hi_bit <= NUM_BITS_IN_FIELD < 256) * - * @tparam Curve - * @param scalar - * @param round - * @param normal_slice_size - * @return uint32_t + * @param scalar The scalar field element (must be in non-Montgomery form) + * @param round The current Pippenger round (0 = most significant bits) + * @param slice_size Number of bits per slice + * @return uint32_t The bucket index for this round */ template uint32_t MSM::get_scalar_slice(const typename Curve::ScalarField& scalar, size_t round, size_t slice_size) noexcept { + constexpr size_t LIMB_BITS = 64; + size_t hi_bit = NUM_BITS_IN_FIELD - (round * slice_size); - // todo remove - bool last_slice = hi_bit < slice_size; - size_t target_slice_size = last_slice ? hi_bit : slice_size; - size_t lo_bit = last_slice ? 0 : hi_bit - slice_size; - size_t start_limb = lo_bit / 64; - size_t end_limb = hi_bit / 64; - size_t lo_slice_offset = lo_bit & 63; - size_t lo_slice_bits = std::min(target_slice_size, 64 - lo_slice_offset); - size_t hi_slice_bits = target_slice_size - lo_slice_bits; - size_t lo_slice = (scalar.data[start_limb] >> lo_slice_offset) & ((static_cast(1) << lo_slice_bits) - 1); - size_t hi_slice = (scalar.data[end_limb] & ((static_cast(1) << hi_slice_bits) - 1)); - - uint32_t lo = static_cast(lo_slice); - uint32_t hi = static_cast(hi_slice); - - uint32_t result = lo + (hi << lo_slice_bits); - return result; + size_t lo_bit = (hi_bit < slice_size) ? 0 : hi_bit - slice_size; + + BB_ASSERT_DEBUG(lo_bit < hi_bit); + BB_ASSERT_DEBUG(hi_bit <= NUM_BITS_IN_FIELD); // Ensures hi_bit < 256, so end_limb <= 3 + + size_t start_limb = lo_bit / LIMB_BITS; + size_t end_limb = hi_bit / LIMB_BITS; + size_t lo_slice_offset = lo_bit & (LIMB_BITS - 1); + size_t actual_slice_size = hi_bit - lo_bit; + size_t lo_slice_bits = + (LIMB_BITS - lo_slice_offset < actual_slice_size) ? (LIMB_BITS - lo_slice_offset) : actual_slice_size; + size_t hi_slice_bits = actual_slice_size - lo_slice_bits; + + uint64_t lo_slice = (scalar.data[start_limb] >> lo_slice_offset) & ((1ULL << lo_slice_bits) - 1); + uint64_t hi_slice = (start_limb != end_limb) ? (scalar.data[end_limb] & ((1ULL << hi_slice_bits) - 1)) : 0; + + return static_cast(lo_slice | (hi_slice << lo_slice_bits)); } -/** - * @brief For a given number of points, compute the optimal Pippenger bucket size - * - * @tparam Curve - * @param num_points - * @return constexpr size_t - */ -template size_t MSM::get_optimal_log_num_buckets(const size_t num_points) noexcept +template uint32_t MSM::get_optimal_log_num_buckets(const size_t num_points) noexcept { - // We do 2 group operations per bucket, and they are full 3D Jacobian adds which are ~2x more than an affine add - constexpr size_t COST_OF_BUCKET_OP_RELATIVE_TO_POINT = 5; - size_t cached_cost = static_cast(-1); - size_t target_bit_slice = 0; - for (size_t bit_slice = 1; bit_slice < 20; ++bit_slice) { - const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bit_slice); - const size_t num_buckets = 1 << bit_slice; - const size_t addition_cost = num_rounds * num_points; - const size_t bucket_cost = num_rounds * num_buckets * COST_OF_BUCKET_OP_RELATIVE_TO_POINT; - const size_t total_cost = addition_cost + bucket_cost; - if (total_cost < cached_cost) { - cached_cost = total_cost; - target_bit_slice = bit_slice; + // Cost model: total_cost = num_rounds * (num_points + num_buckets * BUCKET_ACCUMULATION_COST) + auto compute_cost = [&](uint32_t bits) { + size_t rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, static_cast(bits)); + size_t buckets = size_t{ 1 } << bits; + return rounds * (num_points + buckets * BUCKET_ACCUMULATION_COST); + }; + + uint32_t best_bits = 1; + size_t best_cost = compute_cost(1); + for (uint32_t bits = 2; bits < MAX_SLICE_BITS; ++bits) { + size_t cost = compute_cost(bits); + if (cost < best_cost) { + best_cost = cost; + best_bits = bits; } } - return target_bit_slice; + return best_bits; } -/** - * @brief Given a number of points and an optimal bucket size, should we use the affine trick? - * - * @tparam Curve - * @param num_points - * @param num_buckets - * @return true - * @return false - */ template bool MSM::use_affine_trick(const size_t num_points, const size_t num_buckets) noexcept { - if (num_points < 128) { + if (num_points < AFFINE_TRICK_THRESHOLD) { return false; } // Affine trick requires log(N) modular inversions per Pippenger round. - // It saves NUM_POINTS * COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION field muls - // It also saves NUM_BUCKETS * EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1 field muls - // due to all our buckets having Z=1 if we use the affine trick - - // COST_OF_INVERSION cost: - // Requires NUM_BITS_IN_FIELD sqarings - // We use 4-bit windows = ((NUM_BITS_IN_FIELD + 3) / 4) multiplications - // Computing 4-bit window table requires 14 muls - constexpr size_t COST_OF_INVERSION = NUM_BITS_IN_FIELD + ((NUM_BITS_IN_FIELD + 3) / 4) + 14; - constexpr size_t COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION = 5; - constexpr size_t EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1 = 5; - - double num_points_f = static_cast(num_points); - double log2_num_points_f = log2(num_points_f); - - size_t group_op_cost_saving_per_round = (num_points * COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION) + - (num_buckets * EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1); - double inversion_cost_per_round = log2_num_points_f * static_cast(COST_OF_INVERSION); - - return static_cast(group_op_cost_saving_per_round) > inversion_cost_per_round; + // It saves num_points * AFFINE_TRICK_SAVINGS_PER_OP field muls, plus + // num_buckets * JACOBIAN_Z_NOT_ONE_PENALTY field muls (buckets have Z=1 with affine trick) + + // Cost of modular inversion via exponentiation: + // - NUM_BITS_IN_FIELD squarings + // - (NUM_BITS_IN_FIELD + 3) / 4 multiplications (4-bit windows) + // - INVERSION_TABLE_COST multiplications for lookup table + constexpr size_t COST_OF_INVERSION = NUM_BITS_IN_FIELD + ((NUM_BITS_IN_FIELD + 3) / 4) + INVERSION_TABLE_COST; + + double log2_num_points = log2(static_cast(num_points)); + size_t savings_per_round = (num_points * AFFINE_TRICK_SAVINGS_PER_OP) + (num_buckets * JACOBIAN_Z_NOT_ONE_PENALTY); + double inversion_cost_per_round = log2_num_points * static_cast(COST_OF_INVERSION); + + return static_cast(savings_per_round) > inversion_cost_per_round; } -/** - * @brief adds a bunch of points together using affine addition formulae. - * @details Paradoxically, the affine formula is crazy efficient if you have a lot of independent point additions to - * perform. Affine formula: - * - * \lambda = (y_2 - y_1) / (x_2 - x_1) - * x_3 = \lambda^2 - (x_2 + x_1) - * y_3 = \lambda*(x_1 - x_3) - y_1 - * - * Traditionally, we avoid affine formulae like the plague, because computing lambda requires a modular inverse, - * which is outrageously expensive. - * - * However! We can use Montgomery's batch inversion technique to amortise the cost of the inversion to ~0. - * - * The way batch inversion works is as follows. Let's say you want to compute \{ 1/x_1, 1/x_2, ..., 1/x_n \} - * The trick is to compute the product x_1x_2...x_n , whilst storing all of the temporary products. - * i.e. we have an array A = [x_1, x_1x_2, ..., x_1x_2...x_n] - * We then compute a single inverse: I = 1 / x_1x_2...x_n - * Finally, we can use our accumulated products, to quotient out individual inverses. - * We can get an individual inverse at index i, by computing I.A_{i-1}.(x_nx_n-1...x_i+1) - * The last product term we can compute on-the-fly, as it grows by one element for each additional inverse that we - * require. - * - * TLDR: amortized cost of a modular inverse is 3 field multiplications per inverse. - * Which means we can compute a point addition with SIX field multiplications in total. - * The traditional Jacobian-coordinate formula requires 11. - * - * There is a catch though - we need large sequences of independent point additions! - * i.e. the output from one point addition in the sequence is NOT an input to any other point addition in the - * sequence. - * - * We can re-arrange the Pippenger algorithm to get this property, but it's...complicated - * @tparam Curve - * @param points points to be added pairwise; result is stored in the latter half of the array - * @param num_points - * @param scratch_space coordinate field scratch space needed for batched inversion - **/ template void MSM::add_affine_points(typename Curve::AffineElement* points, const size_t num_points, @@ -328,12 +243,14 @@ void MSM::add_affine_points(typename Curve::AffineElement* points, using Fq = typename Curve::BaseField; Fq batch_inversion_accumulator = Fq::one(); + // Forward pass: prepare batch inversion inputs. + // We reuse points[i+1] storage: .x stores (x2-x1), .y stores (y2-y1)*accumulator for (size_t i = 0; i < num_points; i += 2) { - scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 - points[i + 1].x -= points[i].x; // x2 - x1 - points[i + 1].y -= points[i].y; // y2 - y1 + scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 (needed later for x3) + points[i + 1].x -= points[i].x; // x2 - x1 (denominator for lambda) + points[i + 1].y -= points[i].y; // y2 - y1 (numerator for lambda) points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (points[i + 1].x); + batch_inversion_accumulator *= (points[i + 1].x); // accumulate denominators } if (batch_inversion_accumulator == 0) { // prefer abort to throw for code that might emit from multiple threads @@ -342,421 +259,215 @@ void MSM::add_affine_points(typename Curve::AffineElement* points, batch_inversion_accumulator = batch_inversion_accumulator.invert(); } - // Iterate backwards through the points, comnputing pairwise affine additions; addition results are stored in the - // latter half of the array + // Backward pass: compute additions using batch inversion results. + // Reusing points[i+1] storage: .y becomes lambda, .x becomes lambda^2. + // Results are written to the top half of points array: points[(i+num_points)/2]. + // Loop terminates when i underflows (becomes > num_points for unsigned). for (size_t i = (num_points)-2; i < num_points; i -= 2) { - points[i + 1].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= points[i + 1].x; - points[i + 1].x = points[i + 1].y.sqr(); - points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda_squared - x2 - // - x1 - // Memory bandwidth is a bit of a bottleneck here. - // There's probably a more elegant way of structuring our data so we don't need to do all of this - // prefetching + points[i + 1].y *= batch_inversion_accumulator; // .y now holds lambda = (y2-y1)/(x2-x1) + batch_inversion_accumulator *= points[i + 1].x; // restore accumulator for next iteration + points[i + 1].x = points[i + 1].y.sqr(); // .x now holds lambda^2 + points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda^2 - x2 - x1 + // Output addresses jump non-sequentially: points[(i+n)>>1] defeats hardware prefetcher. + // Fetching 2 iterations ahead ensures data arrives before needed. if (i >= 2) { __builtin_prefetch(points + i - 2); __builtin_prefetch(points + i - 1); __builtin_prefetch(points + ((i + num_points - 2) >> 1)); __builtin_prefetch(scratch_space + ((i - 2) >> 1)); } - points[i].x -= points[(i + num_points) >> 1].x; - points[i].x *= points[i + 1].y; - points[(i + num_points) >> 1].y = points[i].x - points[i].y; + // Compute y3 = lambda * (x1 - x3) - y1, reusing points[i].x as temp storage + points[i].x -= points[(i + num_points) >> 1].x; // x1 - x3 + points[i].x *= points[i + 1].y; // lambda * (x1 - x3) + points[(i + num_points) >> 1].y = points[i].x - points[i].y; // y3 = lambda*(x1-x3) - y1 } } -/** - * @brief Top-level Pippenger algorithm where number of points is small and we are not using the Affine trick - * - * @tparam Curve - * @param msm_data - * @return Curve::AffineElement - */ template -typename Curve::Element MSM::small_pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept +typename Curve::Element MSM::jacobian_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept { - std::span& nonzero_scalar_indices = msm_data.scalar_indices; - const size_t size = nonzero_scalar_indices.size(); - const size_t bits_per_slice = get_optimal_log_num_buckets(size); - const size_t num_buckets = 1 << bits_per_slice; - JacobianBucketAccumulators bucket_data = JacobianBucketAccumulators(num_buckets); - Element round_output = Curve::Group::point_at_infinity; + const size_t size = msm_data.scalar_indices.size(); + const uint32_t bits_per_slice = get_optimal_log_num_buckets(size); + const size_t num_buckets = size_t{ 1 } << bits_per_slice; + const uint32_t num_rounds = static_cast((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice); + const uint32_t remainder = NUM_BITS_IN_FIELD % bits_per_slice; + + JacobianBucketAccumulators bucket_data(num_buckets); + Element msm_result = Curve::Group::point_at_infinity; + + for (uint32_t round = 0; round < num_rounds; ++round) { + // Populate buckets using Jacobian accumulation + for (size_t i = 0; i < size; ++i) { + uint32_t idx = msm_data.scalar_indices[i]; + uint32_t bucket = get_scalar_slice(msm_data.scalars[idx], round, bits_per_slice); + if (bucket > 0) { + if (bucket_data.bucket_exists.get(bucket)) { + bucket_data.buckets[bucket] += msm_data.points[idx]; + } else { + bucket_data.buckets[bucket] = msm_data.points[idx]; + bucket_data.bucket_exists.set(bucket, true); + } + } + } - const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice); + // Reduce buckets and accumulate into result + Element bucket_result = accumulate_buckets(bucket_data); + bucket_data.bucket_exists.clear(); - for (size_t i = 0; i < num_rounds; ++i) { - round_output = evaluate_small_pippenger_round(msm_data, i, bucket_data, round_output, bits_per_slice); + uint32_t num_doublings = (round == num_rounds - 1 && remainder != 0) ? remainder : bits_per_slice; + for (uint32_t i = 0; i < num_doublings; ++i) { + msm_result.self_dbl(); + } + msm_result += bucket_result; } - return round_output; + return msm_result; } -/** - * @brief Top-level Pippenger algorithm - * - * @tparam Curve - * @param msm_data - * @return Curve::AffineElement - */ template -typename Curve::Element MSM::pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept +typename Curve::Element MSM::affine_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept { - const size_t msm_size = msm_data.scalar_indices.size(); - const size_t bits_per_slice = get_optimal_log_num_buckets(msm_size); - const size_t num_buckets = 1 << bits_per_slice; + const size_t num_points = msm_data.scalar_indices.size(); + const uint32_t bits_per_slice = get_optimal_log_num_buckets(num_points); + const size_t num_buckets = size_t{ 1 } << bits_per_slice; - if (!use_affine_trick(msm_size, num_buckets)) { - return small_pippenger_low_memory_with_transformed_scalars(msm_data); + if (!use_affine_trick(num_points, num_buckets)) { + return jacobian_pippenger_with_transformed_scalars(msm_data); } - // TODO(https://github.com/AztecProtocol/barretenberg/issues/1452): Consider allowing this memory to persist rather - // than allocating/deallocating on every execution. - AffineAdditionData affine_data = AffineAdditionData(); - BucketAccumulators bucket_data = BucketAccumulators(num_buckets); - Element round_output = Curve::Group::point_at_infinity; + const uint32_t num_rounds = static_cast((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice); + const uint32_t remainder = NUM_BITS_IN_FIELD % bits_per_slice; - const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice); - for (size_t i = 0; i < num_rounds; ++i) { - round_output = evaluate_pippenger_round(msm_data, i, affine_data, bucket_data, round_output, bits_per_slice); - } + // Per-call allocation for WASM compatibility (thread_local causes issues in WASM) + AffineAdditionData affine_data; + BucketAccumulators bucket_data(num_buckets); - return (round_output); -} + Element msm_result = Curve::Group::point_at_infinity; -/** - * @brief Evaluate a single Pippenger round when we do not use the Affine trick - * - * @tparam Curve - * @param msm_data - * @param round_index - * @param bucket_data - * @param previous_round_output - * @param bits_per_slice - * @return Curve::Element - */ -template -typename Curve::Element MSM::evaluate_small_pippenger_round(MSMData& msm_data, - const size_t round_index, - MSM::JacobianBucketAccumulators& bucket_data, - typename Curve::Element previous_round_output, - const size_t bits_per_slice) noexcept -{ - std::span& nonzero_scalar_indices = msm_data.scalar_indices; - std::span& scalars = msm_data.scalars; - std::span& points = msm_data.points; - - const size_t size = nonzero_scalar_indices.size(); - for (size_t i = 0; i < size; ++i) { - BB_ASSERT_DEBUG(nonzero_scalar_indices[i] < scalars.size()); - uint32_t bucket_index = get_scalar_slice(scalars[nonzero_scalar_indices[i]], round_index, bits_per_slice); - BB_ASSERT_DEBUG(bucket_index < static_cast(1 << bits_per_slice)); - if (bucket_index > 0) { - // do this check because we do not reset bucket_data.buckets after each round - // (i.e. not neccessarily at infinity) - if (bucket_data.bucket_exists.get(bucket_index)) { - bucket_data.buckets[bucket_index] += points[nonzero_scalar_indices[i]]; - } else { - bucket_data.buckets[bucket_index] = points[nonzero_scalar_indices[i]]; - bucket_data.bucket_exists.set(bucket_index, true); + for (uint32_t round = 0; round < num_rounds; ++round) { + // Build point schedule for this round + { + for (size_t i = 0; i < num_points; ++i) { + uint32_t idx = msm_data.scalar_indices[i]; + uint32_t bucket_idx = get_scalar_slice(msm_data.scalars[idx], round, bits_per_slice); + msm_data.point_schedule[i] = PointScheduleEntry::create(idx, bucket_idx).data; } } - } - Element round_output; - round_output.self_set_infinity(); - round_output = accumulate_buckets(bucket_data); - bucket_data.bucket_exists.clear(); - Element result = previous_round_output; - const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice); - size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0)) - ? NUM_BITS_IN_FIELD % bits_per_slice - : bits_per_slice; - for (size_t i = 0; i < num_doublings; ++i) { - result.self_dbl(); - } - result += round_output; - return result; -} - -/** - * @brief Evaluate a single Pippenger round where we use the affine trick - * - * @tparam Curve - * @param msm_data - * @param round_index - * @param affine_data - * @param bucket_data - * @param previous_round_output - * @param bits_per_slice - * @return Curve::Element - */ -template -typename Curve::Element MSM::evaluate_pippenger_round(MSMData& msm_data, - const size_t round_index, - MSM::AffineAdditionData& affine_data, - MSM::BucketAccumulators& bucket_data, - typename Curve::Element previous_round_output, - const size_t bits_per_slice) noexcept -{ - std::span& scalar_indices = msm_data.scalar_indices; // indices of nonzero scalars - std::span& scalars = msm_data.scalars; - std::span& points = msm_data.points; - std::span& round_schedule = msm_data.point_schedule; - const size_t size = scalar_indices.size(); - - // Construct a "round schedule". Each entry describes: - // 1. low 32 bits: which bucket index do we add the point into? (bucket index = slice value) - // 2. high 32 bits: which point index do we source the point from? - for (size_t i = 0; i < size; ++i) { - BB_ASSERT_DEBUG(scalar_indices[i] < scalars.size()); - round_schedule[i] = get_scalar_slice(scalars[scalar_indices[i]], round_index, bits_per_slice); - round_schedule[i] += (static_cast(scalar_indices[i]) << 32ULL); - } - // Sort our point schedules based on their bucket values. Reduces memory throughput in next step of algo - const size_t num_zero_entries = scalar_multiplication::process_buckets_count_zero_entries( - &round_schedule[0], size, static_cast(bits_per_slice)); - BB_ASSERT_DEBUG(num_zero_entries <= size); - const size_t round_size = size - num_zero_entries; - - Element round_output; - round_output.self_set_infinity(); - - if (round_size > 0) { - std::span point_schedule(&round_schedule[num_zero_entries], round_size); - // Iterate through our point schedule and add points into corresponding buckets - consume_point_schedule(point_schedule, points, affine_data, bucket_data, 0, 0); - round_output = accumulate_buckets(bucket_data); - bucket_data.bucket_exists.clear(); - } + // Sort by bucket and count zero-bucket entries + size_t num_zero_bucket_entries = + sort_point_schedule_and_count_zero_buckets(&msm_data.point_schedule[0], num_points, bits_per_slice); + size_t round_size = num_points - num_zero_bucket_entries; + + // Accumulate points into buckets + Element bucket_result = Curve::Group::point_at_infinity; + if (round_size > 0) { + std::span schedule(&msm_data.point_schedule[num_zero_bucket_entries], round_size); + batch_accumulate_points_into_buckets(schedule, msm_data.points, affine_data, bucket_data); + bucket_result = accumulate_buckets(bucket_data); + bucket_data.bucket_exists.clear(); + } - Element result = previous_round_output; - const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice); - size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0)) - ? NUM_BITS_IN_FIELD % bits_per_slice - : bits_per_slice; - for (size_t i = 0; i < num_doublings; ++i) { - result.self_dbl(); + // Combine into running result + uint32_t num_doublings = (round == num_rounds - 1 && remainder != 0) ? remainder : bits_per_slice; + for (uint32_t i = 0; i < num_doublings; ++i) { + msm_result.self_dbl(); + } + msm_result += bucket_result; } - result += round_output; - return result; + return msm_result; } -/** - * @brief Given a list of points and target buckets to add into, perform required group operations - * @details This algorithm uses exclusively affine group operations, using batch inversions to amortise costs - * - * @tparam Curve - * @param point_schedule - * @param points - * @param affine_data - * @param bucket_data - * @param num_input_points_processed - * @param num_queued_affine_points - */ template -void MSM::consume_point_schedule(std::span point_schedule, - std::span points, - MSM::AffineAdditionData& affine_data, - MSM::BucketAccumulators& bucket_data, - size_t num_input_points_processed, - size_t num_queued_affine_points) noexcept +void MSM::batch_accumulate_points_into_buckets(std::span point_schedule, + std::span points, + MSM::AffineAdditionData& affine_data, + MSM::BucketAccumulators& bucket_data) noexcept { - size_t point_it = num_input_points_processed; - size_t affine_input_it = num_queued_affine_points; - // N.B. points and point_schedule MAY HAVE DIFFERENT SIZES - // We source the number of actual points we work on from the point schedule - size_t num_points = point_schedule.size(); - auto& bucket_accumulator_exists = bucket_data.bucket_exists; - auto& affine_addition_scratch_space = affine_data.points_to_add; - auto& bucket_accumulators = bucket_data.buckets; - auto& affine_addition_output_bucket_destinations = affine_data.addition_result_bucket_destinations; - auto& scalar_scratch_space = affine_data.scalar_scratch_space; - auto& output_point_schedule = affine_data.addition_result_bucket_destinations; - AffineElement null_location{}; - // We do memory prefetching, `prefetch_max` ensures we do not overflow our containers - size_t prefetch_max = (num_points - 32); - if (num_points < 32) { - prefetch_max = 0; - } - size_t end = num_points - 1; - if (num_points == 0) { - end = 0; - } + if (point_schedule.empty()) { + return; + } + + size_t point_it = 0; + size_t scratch_it = 0; + const size_t num_points = point_schedule.size(); + const size_t prefetch_max = (num_points >= PREFETCH_LOOKAHEAD) ? (num_points - PREFETCH_LOOKAHEAD) : 0; + const size_t last_index = num_points - 1; + + // Iterative loop - continues until all points processed and no work remains in scratch space + while (point_it < num_points || scratch_it != 0) { + // Step 1: Fill scratch space with up to BATCH_SIZE/2 independent additions + while (((scratch_it + 1) < AffineAdditionData::BATCH_SIZE) && (point_it < last_index)) { + // Prefetch points we'll need soon (every PREFETCH_INTERVAL iterations) + if ((point_it < prefetch_max) && ((point_it & PREFETCH_INTERVAL_MASK) == 0)) { + for (size_t i = PREFETCH_LOOKAHEAD / 2; i < PREFETCH_LOOKAHEAD; ++i) { + PointScheduleEntry entry{ point_schedule[point_it + i] }; + __builtin_prefetch(&points[entry.point_index()]); + } + } + + PointScheduleEntry lhs{ point_schedule[point_it] }; + PointScheduleEntry rhs{ point_schedule[point_it + 1] }; + + process_bucket_pair(lhs.bucket_index(), + rhs.bucket_index(), + &points[lhs.point_index()], + &points[rhs.point_index()], + affine_data, + bucket_data, + scratch_it, + point_it); + } - // Step 1: Fill up `affine_addition_scratch_space` with up to AffineAdditionData::BATCH_SIZE/2 independent additions - while (((affine_input_it + 1) < AffineAdditionData::BATCH_SIZE) && (point_it < end)) { + // Handle the last point (odd count case) - separate to avoid bounds check on point_schedule[point_it + 1] + if (point_it == last_index) { + PointScheduleEntry last{ point_schedule[point_it] }; + process_single_point( + last.bucket_index(), &points[last.point_index()], affine_data, bucket_data, scratch_it, point_it); + } - // we prefetchin' - if ((point_it < prefetch_max) && ((point_it & 0x0f) == 0)) { - for (size_t i = 16; i < 32; ++i) { - __builtin_prefetch(&points[(point_schedule[point_it + i] >> 32ULL)]); - } + // Compute independent additions using Montgomery's batch inversion trick + size_t num_points_to_add = scratch_it; + if (num_points_to_add >= 2) { + add_affine_points( + affine_data.points_to_add.data(), num_points_to_add, affine_data.inversion_scratch_space.data()); } - // We do some branchless programming here to minimize instruction pipeline flushes - // TODO(@zac-williamson, cc @ludamad) check these ternary operators are not branching! -> (ludamad: they don't, - // but its not clear that the conditional move is fundamentally less expensive) - // We are iterating through our points and - // can come across the following scenarios: 1: The next 2 points in `point_schedule` belong to the *same* bucket - // (happy path - can put both points into affine_addition_scratch_space) - // 2: The next 2 points have different bucket destinations AND point_schedule[point_it].bucket contains a point - // (happyish path - we can put points[lhs_schedule] and buckets[lhs_bucket] into - // affine_addition_scratch_space) - // 3: The next 2 points have different bucket destionations AND point_schedule[point_it].bucket is empty - // We cache points[lhs_schedule] into buckets[lhs_bucket] - // We iterate `point_it` by 2 (case 1), or by 1 (case 2 or 3). The number of points we add into - // `affine_addition_scratch_space` is 2 (case 1 or 2) or 0 (case 3). - uint64_t lhs_schedule = point_schedule[point_it]; - uint64_t rhs_schedule = point_schedule[point_it + 1]; - size_t lhs_bucket = static_cast(lhs_schedule) & 0xFFFFFFFF; - size_t rhs_bucket = static_cast(rhs_schedule) & 0xFFFFFFFF; - size_t lhs_point = static_cast(lhs_schedule >> 32); - size_t rhs_point = static_cast(rhs_schedule >> 32); - - bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket); - bool buckets_match = lhs_bucket == rhs_bucket; - bool do_affine_add = buckets_match || has_bucket_accumulator; - - const AffineElement* lhs_source = &points[lhs_point]; - const AffineElement* rhs_source = buckets_match ? &points[rhs_point] : &bucket_accumulators[lhs_bucket]; - - // either two points are set to be added (point to point or point into bucket accumulator), or lhs is stored in - // the bucket and rhs is temporarily ignored - AffineElement* lhs_destination = - do_affine_add ? &affine_addition_scratch_space[affine_input_it] : &bucket_accumulators[lhs_bucket]; - AffineElement* rhs_destination = - do_affine_add ? &affine_addition_scratch_space[affine_input_it + 1] : &null_location; - - // if performing an affine add, set the destination bucket corresponding to the addition result - uint64_t& source_bucket_destination = affine_addition_output_bucket_destinations[affine_input_it >> 1]; - source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination; - - // unconditional swap. No if statements here. - *lhs_destination = *lhs_source; - *rhs_destination = *rhs_source; - - // indicate whether bucket_accumulators[lhs_bucket] will contain a point after this iteration - bucket_accumulator_exists.set( - lhs_bucket, - (has_bucket_accumulator && buckets_match) || /* bucket has an accum and its not being used in current add */ - !do_affine_add); /* lhs point is cached into the bucket */ - - affine_input_it += do_affine_add ? 2 : 0; - point_it += (do_affine_add && buckets_match) ? 2 : 1; - } - // We have to handle the last point as an edge case so that we dont overflow the bounds of `point_schedule`. If the - // bucket accumulator exists, we add the point to it, otherwise the point simply becomes the bucket accumulator. - if (point_it == num_points - 1) { - uint64_t lhs_schedule = point_schedule[point_it]; - size_t lhs_bucket = static_cast(lhs_schedule) & 0xFFFFFFFF; - size_t lhs_point = static_cast(lhs_schedule >> 32); - bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket); - - if (has_bucket_accumulator) { // point is added to its bucket accumulator - affine_addition_scratch_space[affine_input_it] = points[lhs_point]; - affine_addition_scratch_space[affine_input_it + 1] = bucket_accumulators[lhs_bucket]; - bucket_accumulator_exists.set(lhs_bucket, false); - affine_addition_output_bucket_destinations[affine_input_it >> 1] = lhs_bucket; - affine_input_it += 2; - point_it += 1; - } else { // otherwise, cache the point into the bucket - BB_ASSERT_DEBUG(lhs_point < points.size()); - bucket_accumulators[lhs_bucket] = points[lhs_point]; - bucket_accumulator_exists.set(lhs_bucket, true); - point_it += 1; + // add_affine_points stores results in the top-half of scratch space + AffineElement* affine_output = affine_data.points_to_add.data() + (num_points_to_add / 2); + + // Recirculate addition outputs back into scratch space or bucket accumulators + size_t new_scratch_it = 0; + size_t output_it = 0; + size_t num_outputs = num_points_to_add / 2; + + while ((num_outputs > 1) && (output_it + 1 < num_outputs)) { + uint32_t lhs_bucket = affine_data.addition_result_bucket_destinations[output_it]; + uint32_t rhs_bucket = affine_data.addition_result_bucket_destinations[output_it + 1]; + + process_bucket_pair(lhs_bucket, + rhs_bucket, + &affine_output[output_it], + &affine_output[output_it + 1], + affine_data, + bucket_data, + new_scratch_it, + output_it); } - } - // Now that we have populated `affine_addition_scratch_space`, - // compute `num_affine_points_to_add` independent additions using the Affine trick - size_t num_affine_points_to_add = affine_input_it; - if (num_affine_points_to_add >= 2) { - add_affine_points(&affine_addition_scratch_space[0], num_affine_points_to_add, &scalar_scratch_space[0]); - } - // `add_affine_points` stores the result in the top-half of the used scratch space - G1* affine_output = &affine_addition_scratch_space[0] + (num_affine_points_to_add / 2); - - // Process the addition outputs. - // We either need to feed the addition outputs back into affine_addition_scratch_space for more addition operations. - // Or, if there are no more additions for a bucket, we store the addition output in a bucket accumulator. - size_t new_scratch_space_it = 0; - size_t affine_output_it = 0; - size_t num_affine_output_points = num_affine_points_to_add / 2; - // This algorithm is equivalent to the one we used to populate `affine_addition_scratch_space` from the point - // schedule, however here we source points from a different location (the addition results) - while ((affine_output_it < (num_affine_output_points - 1)) && (num_affine_output_points > 0)) { - size_t lhs_bucket = static_cast(affine_addition_output_bucket_destinations[affine_output_it]); - size_t rhs_bucket = static_cast(affine_addition_output_bucket_destinations[affine_output_it + 1]); - BB_ASSERT_DEBUG(lhs_bucket < bucket_accumulator_exists.size()); - - bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket); - bool buckets_match = (lhs_bucket == rhs_bucket); - bool do_affine_add = buckets_match || has_bucket_accumulator; - - const AffineElement* lhs_source = &affine_output[affine_output_it]; - const AffineElement* rhs_source = - buckets_match ? &affine_output[affine_output_it + 1] : &bucket_accumulators[lhs_bucket]; - - AffineElement* lhs_destination = - do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it] : &bucket_accumulators[lhs_bucket]; - AffineElement* rhs_destination = - do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it + 1] : &null_location; - - uint64_t& source_bucket_destination = output_point_schedule[new_scratch_space_it >> 1]; - source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination; - - *lhs_destination = *lhs_source; - *rhs_destination = *rhs_source; - - bucket_accumulator_exists.set(lhs_bucket, (has_bucket_accumulator && buckets_match) || !do_affine_add); - new_scratch_space_it += do_affine_add ? 2 : 0; - affine_output_it += (do_affine_add && buckets_match) ? 2 : 1; - } - // perform final iteration as edge case so we don't overflow `affine_addition_output_bucket_destinations` - if (affine_output_it == (num_affine_output_points - 1)) { - - size_t lhs_bucket = static_cast(affine_addition_output_bucket_destinations[affine_output_it]); - - bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket); - if (has_bucket_accumulator) { - BB_ASSERT_DEBUG(new_scratch_space_it + 1 < affine_addition_scratch_space.size()); - BB_ASSERT_DEBUG(lhs_bucket < bucket_accumulators.size()); - BB_ASSERT_DEBUG((new_scratch_space_it >> 1) < output_point_schedule.size()); - affine_addition_scratch_space[new_scratch_space_it] = affine_output[affine_output_it]; - affine_addition_scratch_space[new_scratch_space_it + 1] = bucket_accumulators[lhs_bucket]; - bucket_accumulator_exists.set(lhs_bucket, false); - output_point_schedule[new_scratch_space_it >> 1] = lhs_bucket; - new_scratch_space_it += 2; - affine_output_it += 1; - } else { - bucket_accumulators[lhs_bucket] = affine_output[affine_output_it]; - bucket_accumulator_exists.set(lhs_bucket, true); - affine_output_it += 1; + // Handle the last output (odd count case) + if (num_outputs > 0 && output_it == num_outputs - 1) { + uint32_t bucket = affine_data.addition_result_bucket_destinations[output_it]; + process_single_point( + bucket, &affine_output[output_it], affine_data, bucket_data, new_scratch_it, output_it); } - } - // If we have not finished iterating over the point schedule, - // OR we have affine additions to perform in the scratch space, continue - if (point_it < num_points || new_scratch_space_it != 0) { - consume_point_schedule(point_schedule, points, affine_data, bucket_data, point_it, new_scratch_space_it); + // Continue with recirculated points + scratch_it = new_scratch_it; } } -/** - * @brief Compute multiple multi-scalar multiplications. - * @details If we need to perform multiple MSMs, this method will be more efficient than calling `msm` repeatedly - * This is because this method will be able to dispatch equal work to all threads without splitting the input - * msms up so much. - * The Pippenger algorithm runtime is O(N/log(N)) so there will be slight gains as each inner-thread MSM will - * have a larger N. - * The input scalars are not const because the algorithm converts them out of Montgomery form and then back. - * - * @tparam Curve - * @param points - * @param scalars - * @return std::vector - */ template std::vector MSM::batch_multi_scalar_mul( std::span> points, @@ -772,32 +483,29 @@ std::vector MSM::batch_multi_scalar_mul( std::vector>> thread_msm_results(num_cpus); BB_ASSERT_EQ(thread_work_units.size(), num_cpus); + // Select Pippenger implementation once (hoisting branch outside hot loop) + // Jacobian: safe, handles edge cases | Affine: faster, assumes linearly independent points + auto pippenger_impl = + handle_edge_cases ? jacobian_pippenger_with_transformed_scalars : affine_pippenger_with_transformed_scalars; + // Once we have our work units, each thread can independently evaluate its assigned msms parallel_for(num_cpus, [&](size_t thread_idx) { if (!thread_work_units[thread_idx].empty()) { const std::vector& msms = thread_work_units[thread_idx]; std::vector>& msm_results = thread_msm_results[thread_idx]; + msm_results.reserve(msms.size()); + + // Point schedule buffer for this thread - avoids per-work-unit heap allocation + std::vector point_schedule_buffer; + for (const MSMWorkUnit& msm : msms) { - std::span work_scalars = scalars[msm.batch_msm_index]; - std::span work_points = points[msm.batch_msm_index]; - std::span work_indices = - std::span{ &msm_scalar_indices[msm.batch_msm_index][msm.start_index], msm.size }; - std::vector point_schedule(msm.size); - MSMData msm_data(work_scalars, work_points, work_indices, std::span(point_schedule)); - Element msm_result = Curve::Group::point_at_infinity; - constexpr size_t SINGLE_MUL_THRESHOLD = 16; - if (msm.size < SINGLE_MUL_THRESHOLD) { - msm_result = small_mul(work_scalars, work_points, msm_data.scalar_indices, msm.size); - } else { - // Our non-affine method implicitly handles cases where Weierstrass edge cases may occur - // Note: not as fast! use unsafe version if you know all input base points are linearly independent - if (handle_edge_cases) { - msm_result = small_pippenger_low_memory_with_transformed_scalars(msm_data); - } else { - msm_result = pippenger_low_memory_with_transformed_scalars(msm_data); - } - } - msm_results.push_back(std::make_pair(msm_result, msm.batch_msm_index)); + point_schedule_buffer.resize(msm.size); + MSMData msm_data = + MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm); + Element msm_result = + (msm.size < PIPPENGER_THRESHOLD) ? small_mul(msm_data) : pippenger_impl(msm_data); + + msm_results.emplace_back(msm_result, msm.batch_msm_index); } } }); @@ -805,61 +513,49 @@ std::vector MSM::batch_multi_scalar_mul( // Accumulate results. This part needs to be single threaded, but amount of work done here should be small // TODO(@zac-williamson) check this? E.g. if we are doing a 2^16 MSM with 256 threads this single-threaded part // will be painful. - std::vector results(num_msms); - for (Element& ele : results) { - ele.self_set_infinity(); - } + std::vector results(num_msms, Curve::Group::point_at_infinity); for (const auto& single_thread_msm_results : thread_msm_results) { - for (const std::pair& result : single_thread_msm_results) { - results[result.second] += result.first; + for (const auto& [element, index] : single_thread_msm_results) { + results[index] += element; } } - Element::batch_normalize(&results[0], num_msms); + Element::batch_normalize(results.data(), num_msms); - std::vector affine_results; - for (const auto& ele : results) { - affine_results.emplace_back(AffineElement(ele.x, ele.y)); - } - - // Convert our scalars back into Montgomery form so they remain unchanged - for (auto& msm_scalars : scalars) { - parallel_for_range(msm_scalars.size(), [&](size_t start, size_t end) { + // Convert scalars back TO Montgomery form so they remain unchanged from caller's perspective + for (auto& scalar_span : scalars) { + parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { - msm_scalars[i].self_to_montgomery_form(); + scalar_span[i].self_to_montgomery_form(); } }); } - return affine_results; + + return std::vector(results.begin(), results.end()); } -/** - * @brief Helper method to evaluate a single MSM. Internally calls `batch_multi_scalar_mul` - * - * @tparam Curve - * @param points - * @param _scalars - * @return Curve::AffineElement - */ template typename Curve::AffineElement MSM::msm(std::span points, - PolynomialSpan _scalars, + PolynomialSpan scalars, bool handle_edge_cases) noexcept { - if (_scalars.size() == 0) { + if (scalars.size() == 0) { return Curve::Group::affine_point_at_infinity; } - BB_ASSERT_GTE(points.size(), _scalars.start_index + _scalars.size()); - - // unfortnately we need to remove const on this data type to prevent duplicating _scalars (which is typically - // large) We need to convert `_scalars` out of montgomery form for the MSM. We then convert the scalars back - // into Montgomery form at the end of the algorithm. NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - // TODO(https://github.com/AztecProtocol/barretenberg/issues/1449): handle const correctness. - ScalarField* scalars = const_cast(&_scalars[_scalars.start_index]); - - std::vector> pp{ points.subspan(_scalars.start_index) }; - std::vector> ss{ std::span(scalars, _scalars.size()) }; - AffineElement result = batch_multi_scalar_mul(pp, ss, handle_edge_cases)[0]; - return result; + const size_t num_scalars = scalars.size(); + BB_ASSERT_GTE(points.size(), scalars.start_index + num_scalars); + + // const_cast is safe: we convert from Montgomery, compute, then convert back. + // Scalars are unchanged from the caller's perspective. + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + ScalarField* scalar_ptr = const_cast(&scalars[scalars.start_index]); + std::span scalar_span(scalar_ptr, num_scalars); + + // Wrap into a size-1 batch and delegate to the general method that properly handles multi-threading + std::array, 1> points_batch{ points.subspan(scalars.start_index) }; + std::array, 1> scalars_batch{ scalar_span }; + + auto results = batch_multi_scalar_mul(std::span(points_batch), std::span(scalars_batch), handle_edge_cases); + return results[0]; } template diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp index 448c61b99421..0f410bfe1a63 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp @@ -12,10 +12,8 @@ #include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp" #include "barretenberg/polynomials/polynomial.hpp" -#include "./process_buckets.hpp" -#include "./scalar_multiplication.hpp" - #include "./bitvector.hpp" +#include "./process_buckets.hpp" namespace bb::scalar_multiplication { template class MSM { @@ -25,15 +23,70 @@ template class MSM { using BaseField = typename Curve::BaseField; using AffineElement = typename Curve::AffineElement; - using G1 = AffineElement; static constexpr size_t NUM_BITS_IN_FIELD = ScalarField::modulus.get_msb() + 1; + // ======================= Algorithm Tuning Constants ======================= + // + // These constants control the behavior of the Pippenger MSM algorithm. + // They are empirically tuned for performance on typical hardware. + + // Below this threshold, use naive scalar multiplication instead of Pippenger + static constexpr size_t PIPPENGER_THRESHOLD = 16; + + // Below this threshold, the affine batch inversion trick is not beneficial + // (cost of inversions exceeds savings from cheaper affine additions) + static constexpr size_t AFFINE_TRICK_THRESHOLD = 128; + + // Maximum bits per scalar slice (2^20 = 1M buckets, far beyond practical use) + static constexpr size_t MAX_SLICE_BITS = 20; + + // Number of points to look ahead for memory prefetching + static constexpr size_t PREFETCH_LOOKAHEAD = 32; + + // Prefetch every N iterations (must be power of 2); mask is N-1 for efficient modulo + static constexpr size_t PREFETCH_INTERVAL = 16; + static constexpr size_t PREFETCH_INTERVAL_MASK = PREFETCH_INTERVAL - 1; + + // ======================= Cost Model Constants ======================= + // + // These constants define the relative costs of various operations, + // used to decide between algorithm variants. + + // Cost of bucket accumulation relative to a single point addition + // (2 Jacobian adds per bucket, each ~2.5x cost of affine add) + static constexpr size_t BUCKET_ACCUMULATION_COST = 5; + + // Field multiplications saved per group operation when using affine trick + static constexpr size_t AFFINE_TRICK_SAVINGS_PER_OP = 5; + + // Extra cost of Jacobian group operation when Z coordinate != 1 + static constexpr size_t JACOBIAN_Z_NOT_ONE_PENALTY = 5; + + // Cost of computing 4-bit lookup table for modular exponentiation (14 muls) + static constexpr size_t INVERSION_TABLE_COST = 14; + // =========================================================================== + + // Offset generator used in bucket reduction to probabilistically avoid incomplete-addition + // edge cases in the accumulator. Derived from domain-separated precomputed generators. + static const AffineElement& get_offset_generator() noexcept + { + static const AffineElement offset_generator = []() { + if constexpr (std::same_as) { + return get_precomputed_generators()[0]; + } else { + return get_precomputed_generators()[0]; + } + }(); + return offset_generator; + } + /** * @brief MSMWorkUnit describes an MSM that may be part of a larger MSM * @details For a multi-MSM where each MSM has a variable size, we want to split the MSMs up * such that every available thread has an equal amount of MSM work to perform. - * The actual MSM algorithm used is single-threaded. This is beneficial because we get better scaling. - * + * Each work unit is computed single-threaded; a single MSM may be split across + * threads and reduced. This approach yields better scaling than thread-parallel + * bucket accumulation. */ struct MSMWorkUnit { size_t batch_msm_index = 0; @@ -42,15 +95,43 @@ template class MSM { }; using ThreadWorkUnits = std::vector; + /** + * @brief Container for MSM input data passed between algorithm stages + * @note scalars must be in NON-Montgomery form for correct bucket index computation + */ struct MSMData { - std::span scalars; - std::span points; - std::span scalar_indices; - std::span point_schedule; + std::span scalars; // Scalars (non-Montgomery form) + std::span points; // Input points + std::span scalar_indices; // Indices of nonzero scalars + std::span point_schedule; // Scratch space for point scheduling + + /** + * @brief Factory method to construct MSMData from a work unit + * @details Extracts the appropriate slices from the full arrays based on MSMWorkUnit parameters + */ + static MSMData from_work_unit(std::span> all_scalars, + std::span> all_points, + const std::vector>& all_indices, + std::span point_schedule_buffer, + const MSMWorkUnit& work_unit) noexcept + { + return MSMData{ + .scalars = all_scalars[work_unit.batch_msm_index], + .points = all_points[work_unit.batch_msm_index], + .scalar_indices = + std::span{ &all_indices[work_unit.batch_msm_index][work_unit.start_index], + work_unit.size }, + .point_schedule = point_schedule_buffer, + }; + } }; /** - * @brief Temp data structure, one created per thread! + * @brief Affine bucket accumulators for the fast affine-trick Pippenger variant + * @details Used when handle_edge_cases=false. Stores buckets in affine coordinates, + * enabling use of Montgomery's batch inversion trick. Does NOT handle + * edge cases like point doubling or point at infinity. + * @note Allocated per-call for WASM compatibility. */ struct BucketAccumulators { std::vector buckets; @@ -62,6 +143,13 @@ template class MSM { {} }; + /** + * @brief Jacobian bucket accumulators for the safe Pippenger variant + * @details Used when handle_edge_cases=true or when affine trick is not beneficial. + * Stores buckets in Jacobian coordinates which correctly handle point + * doubling and point at infinity edge cases. + * @note Allocated per-call (not thread_local) in the Jacobian Pippenger path. + */ struct JacobianBucketAccumulators { std::vector buckets; BitVector bucket_exists; @@ -72,81 +160,99 @@ template class MSM { {} }; /** - * @brief Temp data structure, one created per thread! + * @brief Scratch space for batched affine point additions (one per thread) */ struct AffineAdditionData { static constexpr size_t BATCH_SIZE = 2048; // when adding affine points, we have an edge case where the number of points in the batch can overflow by 2 static constexpr size_t BATCH_OVERFLOW_SIZE = 2; std::vector points_to_add; - std::vector scalar_scratch_space; - std::vector addition_result_bucket_destinations; + std::vector inversion_scratch_space; // Used for Montgomery batch inversion denominators + std::vector addition_result_bucket_destinations; + AffineElement null_location{}; // Dummy write target for branchless conditional moves AffineAdditionData() noexcept : points_to_add(BATCH_SIZE + BATCH_OVERFLOW_SIZE) - , scalar_scratch_space(BATCH_SIZE + BATCH_OVERFLOW_SIZE) + , inversion_scratch_space(BATCH_SIZE + BATCH_OVERFLOW_SIZE) , addition_result_bucket_destinations(((BATCH_SIZE + BATCH_OVERFLOW_SIZE) / 2)) {} }; - static size_t get_num_rounds(size_t num_points) noexcept + + /** + * @brief Packed point schedule entry: (point_index << 32) | bucket_index + * @details Used to sort points by their target bucket for cache-efficient processing + */ + struct PointScheduleEntry { + uint64_t data; + + [[nodiscard]] static constexpr PointScheduleEntry create(uint32_t point_index, uint32_t bucket_index) noexcept + { + return { (static_cast(point_index) << 32) | bucket_index }; + } + [[nodiscard]] constexpr uint32_t point_index() const noexcept { return static_cast(data >> 32); } + [[nodiscard]] constexpr uint32_t bucket_index() const noexcept { return static_cast(data); } + }; + + // ======================= Public Methods ======================= + // See README.md for algorithm details and mathematical derivations. + + /** + * @brief Main entry point for single MSM computation + * @param handle_edge_cases false (default): fast affine variant; true: safe Jacobian variant + * @note Scalars are temporarily modified but restored before returning + */ + static AffineElement msm(std::span points, + PolynomialSpan scalars, + bool handle_edge_cases = false) noexcept; + + /** + * @brief Compute multiple MSMs in parallel with work balancing + * @note Scalars are temporarily modified but restored before returning + * @see README.md "Parallelization" + */ + static std::vector batch_multi_scalar_mul(std::span> points, + std::span> scalars, + bool handle_edge_cases = true) noexcept; + + // ======================= Test-Visible Methods ======================= + // Exposed for unit testing; not part of the public API. + + static uint32_t get_num_rounds(size_t num_points) noexcept { - const size_t bits_per_slice = get_optimal_log_num_buckets(num_points); - const size_t num_rounds = (NUM_BITS_IN_FIELD + (bits_per_slice - 1)) / bits_per_slice; - return num_rounds; + const uint32_t bits_per_slice = get_optimal_log_num_buckets(num_points); + return static_cast((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice); } + + /** @brief Batch add n/2 independent point pairs using Montgomery's trick */ static void add_affine_points(AffineElement* points, const size_t num_points, typename Curve::BaseField* scratch_space) noexcept; - static void transform_scalar_and_get_nonzero_scalar_indices(std::span scalars, - std::vector& consolidated_indices) noexcept; - static std::vector get_work_units(std::span> scalars, - std::vector>& msm_scalar_indices) noexcept; - static uint32_t get_scalar_slice(const ScalarField& scalar, size_t round, size_t normal_slice_size) noexcept; - static size_t get_optimal_log_num_buckets(const size_t num_points) noexcept; - static bool use_affine_trick(const size_t num_points, const size_t num_buckets) noexcept; - - static Element small_pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept; - static Element pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept; - static Element evaluate_small_pippenger_round(MSMData& msm_data, - const size_t round_index, - JacobianBucketAccumulators& bucket_data, - Element previous_round_output, - const size_t bits_per_slice) noexcept; - - static Element evaluate_pippenger_round(MSMData& msm_data, - const size_t round_index, - AffineAdditionData& affine_data, - BucketAccumulators& bucket_data, - Element previous_round_output, - const size_t bits_per_slice) noexcept; - - static void consume_point_schedule(std::span point_schedule, - std::span points, - AffineAdditionData& affine_data, - BucketAccumulators& bucket_data, - size_t num_input_points_processed, - size_t num_queued_affine_points) noexcept; + /** @brief Extract c-bit slice from scalar for bucket index computation */ + static uint32_t get_scalar_slice(const ScalarField& scalar, size_t round, size_t slice_size) noexcept; - static std::vector batch_multi_scalar_mul(std::span> points, - std::span> scalars, - bool handle_edge_cases = true) noexcept; - static AffineElement msm(std::span points, - PolynomialSpan _scalars, - bool handle_edge_cases = false) noexcept; + /** @brief Compute optimal bits per slice by minimizing cost over c in [1, MAX_SLICE_BITS) */ + static uint32_t get_optimal_log_num_buckets(size_t num_points) noexcept; + /** @brief Process sorted point schedule into bucket accumulators using batched affine additions */ + static void batch_accumulate_points_into_buckets(std::span point_schedule, + std::span points, + AffineAdditionData& affine_data, + BucketAccumulators& bucket_data) noexcept; + + /** @brief Reduce buckets to single point using running (suffix) sum from high to low: R = sum(k * B_k) */ template static Element accumulate_buckets(BucketType& bucket_accumulators) noexcept { auto& buckets = bucket_accumulators.buckets; BB_ASSERT_DEBUG(buckets.size() > static_cast(0)); int starting_index = static_cast(buckets.size() - 1); - Element prefix_sum; + Element running_sum; bool found_start = false; while (!found_start && starting_index > 0) { const size_t idx = static_cast(starting_index); if (bucket_accumulators.bucket_exists.get(idx)) { - prefix_sum = buckets[idx]; + running_sum = buckets[idx]; found_start = true; } else { starting_index -= 1; @@ -156,32 +262,104 @@ template class MSM { return Curve::Group::point_at_infinity; } BB_ASSERT_DEBUG(starting_index > 0); - AffineElement offset_generator = Curve::Group::affine_point_at_infinity; - if constexpr (std::same_as) { - constexpr auto gen = get_precomputed_generators()[0]; - offset_generator = gen; - } else { - constexpr auto gen = get_precomputed_generators()[0]; - offset_generator = gen; - } - Element sum = prefix_sum + offset_generator; - for (int i = static_cast(starting_index - 1); i > 0; --i) { + const auto& offset_generator = get_offset_generator(); + Element sum = running_sum + offset_generator; + for (int i = starting_index - 1; i > 0; --i) { size_t idx = static_cast(i); BB_ASSERT_DEBUG(idx < bucket_accumulators.bucket_exists.size()); if (bucket_accumulators.bucket_exists.get(idx)) { - - prefix_sum += buckets[idx]; + running_sum += buckets[idx]; } - sum += prefix_sum; + sum += running_sum; } return sum - offset_generator; } + + private: + // ======================= Private Implementation ======================= + + /** @brief Convert scalars from Montgomery form and collect indices of nonzero scalars */ + static void transform_scalar_and_get_nonzero_scalar_indices(std::span scalars, + std::vector& nonzero_scalar_indices) noexcept; + + /** @brief Distribute multiple MSMs across threads with balanced point counts */ + static std::vector get_work_units(std::span> scalars, + std::vector>& msm_scalar_indices) noexcept; + + /** @brief Decide if batch inversion saves work vs Jacobian additions */ + static bool use_affine_trick(size_t num_points, size_t num_buckets) noexcept; + + /** @brief Pippenger using Jacobian buckets (handles edge cases: doubling, infinity) */ + static Element jacobian_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept; + + /** @brief Pippenger using affine buckets with batch inversion (faster, no edge case handling) */ + static Element affine_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept; + + // Helpers for batch_accumulate_points_into_buckets. Inlined for performance. + + // Process single point: if bucket has accumulator, pair them for addition; else cache in bucket. + __attribute__((always_inline)) static void process_single_point(size_t bucket, + const AffineElement* point_source, + AffineAdditionData& affine_data, + BucketAccumulators& bucket_data, + size_t& scratch_it, + size_t& point_it) noexcept + { + bool has_accumulator = bucket_data.bucket_exists.get(bucket); + if (has_accumulator) { + affine_data.points_to_add[scratch_it] = *point_source; + affine_data.points_to_add[scratch_it + 1] = bucket_data.buckets[bucket]; + bucket_data.bucket_exists.set(bucket, false); + affine_data.addition_result_bucket_destinations[scratch_it >> 1] = static_cast(bucket); + scratch_it += 2; + } else { + bucket_data.buckets[bucket] = *point_source; + bucket_data.bucket_exists.set(bucket, true); + } + point_it += 1; + } + + // Branchless bucket pair processing. Updates point_it (by 2 if same bucket, else 1) and scratch_it. + // See README.md "batch_accumulate_points_into_buckets Algorithm" for case analysis. + __attribute__((always_inline)) static void process_bucket_pair(size_t lhs_bucket, + size_t rhs_bucket, + const AffineElement* lhs_source, + const AffineElement* rhs_source_if_match, + AffineAdditionData& affine_data, + BucketAccumulators& bucket_data, + size_t& scratch_it, + size_t& point_it) noexcept + { + bool has_bucket_accumulator = bucket_data.bucket_exists.get(lhs_bucket); + bool buckets_match = lhs_bucket == rhs_bucket; + bool do_affine_add = buckets_match || has_bucket_accumulator; + + const AffineElement* rhs_source = buckets_match ? rhs_source_if_match : &bucket_data.buckets[lhs_bucket]; + + AffineElement* lhs_destination = + do_affine_add ? &affine_data.points_to_add[scratch_it] : &bucket_data.buckets[lhs_bucket]; + AffineElement* rhs_destination = + do_affine_add ? &affine_data.points_to_add[scratch_it + 1] : &affine_data.null_location; + + uint32_t& dest_bucket = affine_data.addition_result_bucket_destinations[scratch_it >> 1]; + dest_bucket = do_affine_add ? static_cast(lhs_bucket) : dest_bucket; + + *lhs_destination = *lhs_source; + *rhs_destination = *rhs_source; + + bucket_data.bucket_exists.set(lhs_bucket, (has_bucket_accumulator && buckets_match) || !do_affine_add); + scratch_it += do_affine_add ? 2 : 0; + point_it += (do_affine_add && buckets_match) ? 2 : 1; + } }; +/** @brief Safe MSM wrapper (defaults to handle_edge_cases=true) */ template typename Curve::Element pippenger(PolynomialSpan scalars, std::span points, bool handle_edge_cases = true) noexcept; + +/** @brief Fast MSM wrapper for linearly independent points (no edge case handling) */ template typename Curve::Element pippenger_unsafe(PolynomialSpan scalars, std::span points) noexcept; @@ -189,5 +367,4 @@ typename Curve::Element pippenger_unsafe(PolynomialSpan; extern template class MSM; -// NEXT STEP ACCUMULATE BUVKETS } // namespace bb::scalar_multiplication diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp index f1b79f2d1f3c..b1d2ee42801a 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp @@ -55,6 +55,7 @@ template class ScalarMultiplicationTest : public ::testing::Test { } return AffineElement(expected_acc); } + static void SetUpTestSuite() { generators.resize(num_points); @@ -69,502 +70,573 @@ template class ScalarMultiplicationTest : public ::testing::Test { ASSERT_EQ(generators[i].x == generators[i + 1].x, false); } }; -}; -using CurveTypes = ::testing::Types; -TYPED_TEST_SUITE(ScalarMultiplicationTest, CurveTypes); + // ======================= Test Methods ======================= -#define SCALAR_MULTIPLICATION_TYPE_ALIASES \ - using Curve = TypeParam; \ - using ScalarField = typename Curve::ScalarField; \ - // using AffineElement = typename Curve::AffineEleent; + void test_get_scalar_slice() + { + constexpr uint32_t fr_size = 254; + constexpr uint32_t slice_bits = 7; + constexpr uint32_t num_slices = (fr_size + 6) / 7; + constexpr uint32_t last_slice_bits = fr_size - ((num_slices - 1) * slice_bits); + + for (size_t x = 0; x < 100; ++x) { + uint256_t input_u256 = engine.get_random_uint256(); + input_u256.data[3] = input_u256.data[3] & 0x3FFFFFFFFFFFFFFF; // 254 bits + while (input_u256 > ScalarField::modulus) { + input_u256 -= ScalarField::modulus; + } + std::vector slices(num_slices); + + uint256_t acc = input_u256; + for (uint32_t i = 0; i < num_slices; ++i) { + uint32_t mask = ((1U << slice_bits) - 1U); + uint32_t shift = slice_bits; + if (i == 0) { + mask = ((1U << last_slice_bits) - 1U); + shift = last_slice_bits; + } + slices[num_slices - 1 - i] = static_cast((acc & mask).data[0]); + acc = acc >> shift; + } -TYPED_TEST(ScalarMultiplicationTest, GetScalarSlice) -{ - SCALAR_MULTIPLICATION_TYPE_ALIASES - const size_t fr_size = 254; - const size_t slice_bits = 7; - size_t num_slices = (fr_size + 6) / 7; - size_t last_slice_bits = fr_size - ((num_slices - 1) * slice_bits); - - for (size_t x = 0; x < 100; ++x) { - - uint256_t input_u256 = engine.get_random_uint256(); - input_u256.data[3] = input_u256.data[3] & 0x3FFFFFFFFFFFFFFF; // 254 bits - while (input_u256 > ScalarField::modulus) { - input_u256 -= ScalarField::modulus; - } - std::vector slices(num_slices); - - uint256_t acc = input_u256; - for (size_t i = 0; i < num_slices; ++i) { - size_t mask = ((1 << slice_bits) - 1UL); - size_t shift = slice_bits; - if (i == 0) { - mask = ((1UL << last_slice_bits) - 1UL); - shift = last_slice_bits; + ScalarField input(input_u256); + input.self_from_montgomery_form(); + + ASSERT_EQ(input.data[0], input_u256.data[0]); + ASSERT_EQ(input.data[1], input_u256.data[1]); + ASSERT_EQ(input.data[2], input_u256.data[2]); + ASSERT_EQ(input.data[3], input_u256.data[3]); + + for (uint32_t i = 0; i < num_slices; ++i) { + uint32_t result = scalar_multiplication::MSM::get_scalar_slice(input, i, slice_bits); + EXPECT_EQ(result, slices[i]); } - slices[num_slices - 1 - i] = static_cast((acc & mask).data[0]); - acc = acc >> shift; } - // uint256_t input_u256 = 0; - - // for (size_t i = 0; i < num_slices; ++i) { - // bool valid_slice = false; - // while (!valid_slice) { - // size_t mask = ((1 << slice_bits) - 1); - // if (i == num_slices - 1) { - // mask = ((1 << last_slice_bits) - 1); - // } - // const uint32_t slice = engine.get_random_uint32() & mask; - - // size_t shift = (fr_size - slice_bits - (i * slice_bits)); - // if (i == num_slices - 1) { - // shift = 0; - // } - - // const uint256_t new_input_u256 = input_u256 + (uint256_t(slice) << shift); - // // BB_ASSERT(new_input_u256 < fr::modulus); - // if (new_input_u256 < fr::modulus) { - // input_u256 = new_input_u256; - // slices[i] = slice; - // valid_slice = true; - // } - // } - // } - - // BB_ASSERT(input_u256 < fr::modulus); - // while (input_u256 > fr::modulus) { - // input_u256 -= fr::modulus; - // } - ScalarField input(input_u256); - input.self_from_montgomery_form(); - - ASSERT_EQ(input.data[0], input_u256.data[0]); - ASSERT_EQ(input.data[1], input_u256.data[1]); - ASSERT_EQ(input.data[2], input_u256.data[2]); - ASSERT_EQ(input.data[3], input_u256.data[3]); - - for (size_t i = 0; i < num_slices; ++i) { - - uint32_t result = scalar_multiplication::MSM::get_scalar_slice(input, i, slice_bits); - EXPECT_EQ(result, slices[i]); + } + + void test_consume_point_batch() + { + const size_t total_points = 30071; + const size_t num_buckets = 128; + + std::vector input_point_schedule; + for (size_t i = 0; i < total_points; ++i) { + uint64_t bucket = static_cast(engine.get_random_uint8()) & 0x7f; + uint64_t schedule = static_cast(bucket) + (static_cast(i) << 32); + input_point_schedule.push_back(schedule); + } + typename scalar_multiplication::MSM::AffineAdditionData affine_data; + typename scalar_multiplication::MSM::BucketAccumulators bucket_data(num_buckets); + scalar_multiplication::MSM::batch_accumulate_points_into_buckets( + input_point_schedule, generators, affine_data, bucket_data); + + std::vector expected_buckets(num_buckets); + for (auto& e : expected_buckets) { + e.self_set_infinity(); + } + for (size_t i = 0; i < total_points; ++i) { + uint64_t bucket = input_point_schedule[i] & 0xFFFFFFFF; + EXPECT_LT(static_cast(bucket), num_buckets); + expected_buckets[static_cast(bucket)] += generators[i]; + } + for (size_t i = 0; i < num_buckets; ++i) { + if (!expected_buckets[i].is_point_at_infinity()) { + AffineElement expected(expected_buckets[i]); + EXPECT_EQ(expected, bucket_data.buckets[i]); + } else { + EXPECT_FALSE(bucket_data.bucket_exists.get(i)); + } } } - // fr test = 0; - // test.data[0] = 0b; - // test.data[1] = 0b010101 -} -TYPED_TEST(ScalarMultiplicationTest, ConsumePointBatch) -{ - using Curve = TypeParam; - using AffineElement = typename Curve::AffineElement; - // todo make this not a multiple of 10k - const size_t total_points = 30071; - const size_t num_buckets = 128; + void test_consume_point_batch_and_accumulate() + { + const size_t total_points = 30071; + const size_t num_buckets = 128; + + std::vector input_point_schedule; + for (size_t i = 0; i < total_points; ++i) { + uint64_t bucket = static_cast(engine.get_random_uint8()) & 0x7f; + uint64_t schedule = static_cast(bucket) + (static_cast(i) << 32); + input_point_schedule.push_back(schedule); + } + typename scalar_multiplication::MSM::AffineAdditionData affine_data; + typename scalar_multiplication::MSM::BucketAccumulators bucket_data(num_buckets); + scalar_multiplication::MSM::batch_accumulate_points_into_buckets( + input_point_schedule, generators, affine_data, bucket_data); - std::vector input_point_schedule; - for (size_t i = 0; i < total_points; ++i) { + Element result = scalar_multiplication::MSM::accumulate_buckets(bucket_data); - uint64_t bucket = static_cast(engine.get_random_uint8()) & 0x7f; + Element expected_acc; + expected_acc.self_set_infinity(); + size_t num_threads = get_num_cpus(); + std::vector expected_accs(num_threads); + size_t range_per_thread = (total_points + num_threads - 1) / num_threads; + parallel_for(num_threads, [&](size_t thread_idx) { + Element expected_thread_acc; + expected_thread_acc.self_set_infinity(); + size_t start = thread_idx * range_per_thread; + size_t end = (thread_idx == num_threads - 1) ? total_points : (thread_idx + 1) * range_per_thread; + bool skip = start >= total_points; + if (!skip) { + for (size_t i = start; i < end; ++i) { + ScalarField scalar = input_point_schedule[i] & 0xFFFFFFFF; + expected_thread_acc += generators[i] * scalar; + } + } + expected_accs[thread_idx] = expected_thread_acc; + }); - uint64_t schedule = static_cast(bucket) + (static_cast(i) << 32); - input_point_schedule.push_back(schedule); + for (size_t i = 0; i < num_threads; ++i) { + expected_acc += expected_accs[i]; + } + AffineElement expected(expected_acc); + EXPECT_EQ(AffineElement(result), expected); } - typename scalar_multiplication::MSM::AffineAdditionData affine_data = - typename scalar_multiplication::MSM::AffineAdditionData(); - typename scalar_multiplication::MSM::BucketAccumulators bucket_data(num_buckets); - scalar_multiplication::MSM::consume_point_schedule( - input_point_schedule, TestFixture::generators, affine_data, bucket_data, 0, 0); - - std::vector expected_buckets(num_buckets); - for (auto& e : expected_buckets) { - e.self_set_infinity(); + + void test_radix_sort_count_zero_entries() + { + const size_t total_points = 30071; + + std::vector input_point_schedule; + for (size_t i = 0; i < total_points; ++i) { + uint64_t bucket = static_cast(engine.get_random_uint8()) & 0x7f; + uint64_t schedule = static_cast(bucket) + (static_cast(i) << 32); + input_point_schedule.push_back(schedule); + } + + size_t result = scalar_multiplication::sort_point_schedule_and_count_zero_buckets( + &input_point_schedule[0], input_point_schedule.size(), 7); + + // Verify zero entry count is correct + size_t expected = 0; + for (size_t i = 0; i < total_points; ++i) { + expected += static_cast((input_point_schedule[i] & 0xFFFFFFFF) == 0); + } + EXPECT_EQ(result, expected); + + // Verify the array is sorted by bucket index (lower 32 bits) + for (size_t i = 1; i < total_points; ++i) { + uint32_t prev_bucket = static_cast(input_point_schedule[i - 1]); + uint32_t curr_bucket = static_cast(input_point_schedule[i]); + EXPECT_LE(prev_bucket, curr_bucket) << "Array not sorted at index " << i; + } } - // std::cout << "computing expected" << std::endl; - for (size_t i = 0; i < total_points; ++i) { - uint64_t bucket = input_point_schedule[i] & 0xFFFFFFFF; - EXPECT_LT(static_cast(bucket), num_buckets); - expected_buckets[static_cast(bucket)] += TestFixture::generators[i]; + + void test_pippenger_low_memory() + { + std::span test_scalars(&scalars[0], num_points); + AffineElement result = + scalar_multiplication::MSM::msm(generators, PolynomialSpan(0, test_scalars)); + AffineElement expected = naive_msm(test_scalars, generators); + EXPECT_EQ(result, expected); } - for (size_t i = 0; i < num_buckets; ++i) { - if (!expected_buckets[i].is_point_at_infinity()) { - AffineElement expected(expected_buckets[i]); - EXPECT_EQ(expected, bucket_data.buckets[i]); - } else { - EXPECT_FALSE(bucket_data.bucket_exists.get(i)); + + void test_batch_multi_scalar_mul() + { + BB_BENCH_NAME("BatchMultiScalarMul"); + + const size_t num_msms = static_cast(engine.get_random_uint8()); + std::vector expected(num_msms); + + std::vector> batch_scalars_copies(num_msms); + std::vector> batch_points_span; + std::vector> batch_scalars_spans; + + size_t vector_offset = 0; + for (size_t k = 0; k < num_msms; ++k) { + const size_t num_pts = static_cast(engine.get_random_uint16()) % 400; + + ASSERT_LT(vector_offset + num_pts, num_points); + std::span batch_points(&generators[vector_offset], num_pts); + + batch_scalars_copies[k].resize(num_pts); + for (size_t i = 0; i < num_pts; ++i) { + batch_scalars_copies[k][i] = scalars[vector_offset + i]; + } + + vector_offset += num_pts; + batch_points_span.push_back(batch_points); + batch_scalars_spans.push_back(batch_scalars_copies[k]); + + expected[k] = naive_msm(batch_scalars_spans[k], batch_points_span[k]); } + + std::vector result = + scalar_multiplication::MSM::batch_multi_scalar_mul(batch_points_span, batch_scalars_spans); + + EXPECT_EQ(result, expected); } -} -TYPED_TEST(ScalarMultiplicationTest, ConsumePointBatchAndAccumulate) -{ - SCALAR_MULTIPLICATION_TYPE_ALIASES - using Element = typename Curve::Element; - using AffineElement = typename Curve::AffineElement; + void test_batch_multi_scalar_mul_sparse() + { + const size_t num_msms = 10; + std::vector expected(num_msms); - // todo make this not a multiple of 10k - const size_t total_points = 30071; - const size_t num_buckets = 128; + std::vector> batch_scalars(num_msms); + std::vector> batch_points_span; + std::vector> batch_scalars_spans; - std::vector input_point_schedule; - for (size_t i = 0; i < total_points; ++i) { + for (size_t k = 0; k < num_msms; ++k) { + const size_t num_pts = 33; + auto& test_scalars = batch_scalars[k]; - uint64_t bucket = static_cast(engine.get_random_uint8()) & 0x7f; + test_scalars.resize(num_pts); - uint64_t schedule = static_cast(bucket) + (static_cast(i) << 32); - input_point_schedule.push_back(schedule); - } - typename scalar_multiplication::MSM::AffineAdditionData affine_data = - typename scalar_multiplication::MSM::AffineAdditionData(); - typename scalar_multiplication::MSM::BucketAccumulators bucket_data(num_buckets); - scalar_multiplication::MSM::consume_point_schedule( - input_point_schedule, TestFixture::generators, affine_data, bucket_data, 0, 0); - - Element result = scalar_multiplication::MSM::accumulate_buckets(bucket_data); - - Element expected_acc = Element(); - expected_acc.self_set_infinity(); - size_t num_threads = get_num_cpus(); - std::vector expected_accs(num_threads); - size_t range_per_thread = (total_points + num_threads - 1) / num_threads; - parallel_for(num_threads, [&](size_t thread_idx) { - Element expected_thread_acc; - expected_thread_acc.self_set_infinity(); - size_t start = thread_idx * range_per_thread; - size_t end = (thread_idx == num_threads - 1) ? total_points : (thread_idx + 1) * range_per_thread; - bool skip = start >= total_points; - if (!skip) { - for (size_t i = start; i < end; ++i) { - ScalarField scalar = input_point_schedule[i] & 0xFFFFFFFF; - expected_thread_acc += TestFixture::generators[i] * scalar; + size_t fixture_offset = k * num_pts; + + std::span batch_points(&generators[fixture_offset], num_pts); + for (size_t i = 0; i < 13; ++i) { + test_scalars[i] = 0; + } + for (size_t i = 13; i < 23; ++i) { + test_scalars[i] = scalars[fixture_offset + i + 13]; + } + for (size_t i = 23; i < num_pts; ++i) { + test_scalars[i] = 0; } + batch_points_span.push_back(batch_points); + batch_scalars_spans.push_back(batch_scalars[k]); + + expected[k] = naive_msm(batch_scalars[k], batch_points); } - expected_accs[thread_idx] = expected_thread_acc; - }); - for (size_t i = 0; i < num_threads; ++i) { - expected_acc += expected_accs[i]; + std::vector result = + scalar_multiplication::MSM::batch_multi_scalar_mul(batch_points_span, batch_scalars_spans); + + EXPECT_EQ(result, expected); } - AffineElement expected(expected_acc); - EXPECT_EQ(AffineElement(result), expected); -} -TYPED_TEST(ScalarMultiplicationTest, RadixSortCountZeroEntries) -{ - const size_t total_points = 30071; + void test_msm() + { + const size_t start_index = 1234; + const size_t num_pts = num_points - start_index; + + PolynomialSpan scalar_span = + PolynomialSpan(start_index, std::span(&scalars[0], num_pts)); + AffineElement result = scalar_multiplication::MSM::msm(generators, scalar_span); - std::vector input_point_schedule; - for (size_t i = 0; i < total_points; ++i) { + std::span points(&generators[start_index], num_pts); + AffineElement expected = naive_msm(scalar_span.span, points); + EXPECT_EQ(result, expected); + } + + void test_msm_all_zeroes() + { + const size_t start_index = 1234; + const size_t num_pts = num_points - start_index; + std::vector test_scalars(num_pts, ScalarField::zero()); - uint64_t bucket = static_cast(engine.get_random_uint8()) & 0x7f; + PolynomialSpan scalar_span = PolynomialSpan(start_index, test_scalars); + AffineElement result = scalar_multiplication::MSM::msm(generators, scalar_span); - uint64_t schedule = static_cast(bucket) + (static_cast(i) << 32); - input_point_schedule.push_back(schedule); + EXPECT_EQ(result, Group::affine_point_at_infinity); } - size_t result = scalar_multiplication::process_buckets_count_zero_entries( - &input_point_schedule[0], input_point_schedule.size(), 7); - size_t expected = 0; - for (size_t i = 0; i < total_points; ++i) { - expected += static_cast((input_point_schedule[i] & 0xFFFFFFFF) == 0); + void test_msm_empty_polynomial() + { + std::vector test_scalars; + std::vector input_points; + PolynomialSpan scalar_span = PolynomialSpan(0, test_scalars); + AffineElement result = scalar_multiplication::MSM::msm(input_points, scalar_span); + + EXPECT_EQ(result, Group::affine_point_at_infinity); } - EXPECT_EQ(result, expected); -} -TYPED_TEST(ScalarMultiplicationTest, EvaluatePippengerRound) -{ - SCALAR_MULTIPLICATION_TYPE_ALIASES - using AffineElement = typename Curve::AffineElement; - using Element = typename Curve::Element; + void test_scalars_unchanged_after_msm() + { + const size_t num_pts = 100; + std::vector test_scalars(num_pts); + std::vector scalars_copy(num_pts); - const size_t num_points = 2; - std::vector scalars(num_points); - constexpr size_t NUM_BITS_IN_FIELD = fr::modulus.get_msb() + 1; - const size_t normal_slice_size = 7; // stop hardcoding - const size_t num_buckets = 1 << normal_slice_size; - - const size_t num_rounds = (NUM_BITS_IN_FIELD + normal_slice_size - 1) / normal_slice_size; - typename scalar_multiplication::MSM::AffineAdditionData affine_data = - typename scalar_multiplication::MSM::AffineAdditionData(); - typename scalar_multiplication::MSM::BucketAccumulators bucket_data(num_buckets); - - for (size_t round_index = num_rounds - 1; round_index < num_rounds; round_index++) { - const size_t num_bits_in_slice = - (round_index == (num_rounds - 1)) ? (NUM_BITS_IN_FIELD % normal_slice_size) : normal_slice_size; - for (size_t i = 0; i < num_points; ++i) { - - size_t hi_bit = NUM_BITS_IN_FIELD - (round_index * normal_slice_size); - size_t lo_bit = hi_bit - normal_slice_size; - if (hi_bit < normal_slice_size) { - lo_bit = 0; - } - uint64_t slice = engine.get_random_uint64() & ((1ULL << num_bits_in_slice) - 1); - // at this point in the algo, scalars has been converted out of montgomery form - uint256_t scalar = uint256_t(slice) << lo_bit; - scalars[i].data[0] = scalar.data[0]; - scalars[i].data[1] = scalar.data[1]; - scalars[i].data[2] = scalar.data[2]; - scalars[i].data[3] = scalar.data[3]; - scalars[i].self_to_montgomery_form(); + for (size_t i = 0; i < num_pts; ++i) { + test_scalars[i] = scalars[i]; + scalars_copy[i] = test_scalars[i]; } - std::vector indices; - scalar_multiplication::MSM::transform_scalar_and_get_nonzero_scalar_indices(scalars, indices); + std::span points(&generators[0], num_pts); + PolynomialSpan scalar_span(0, test_scalars); - Element previous_round_output; - previous_round_output.self_set_infinity(); - for (auto x : indices) { - ASSERT_LT(x, num_points); - } - std::vector point_schedule(scalars.size()); - typename scalar_multiplication::MSM::MSMData msm_data( - scalars, TestFixture::generators, indices, point_schedule); - Element result = scalar_multiplication::MSM::evaluate_pippenger_round( - msm_data, round_index, affine_data, bucket_data, previous_round_output, 7); - Element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - ScalarField baz = scalars[i].to_montgomery_form(); - expected += (TestFixture::generators[i] * baz); + scalar_multiplication::MSM::msm(points, scalar_span); + + for (size_t i = 0; i < num_pts; ++i) { + EXPECT_EQ(test_scalars[i], scalars_copy[i]) << "Scalar at index " << i << " was modified"; } - size_t num_doublings = NUM_BITS_IN_FIELD - (normal_slice_size * (round_index + 1)); - if (round_index == num_rounds - 1) { - num_doublings = 0; + } + + void test_scalars_unchanged_after_batch_multi_scalar_mul() + { + const size_t num_msms = 3; + const size_t num_pts = 100; + + std::vector> batch_scalars(num_msms); + std::vector> scalars_copies(num_msms); + std::vector> batch_points; + std::vector> batch_scalar_spans; + + for (size_t k = 0; k < num_msms; ++k) { + batch_scalars[k].resize(num_pts); + scalars_copies[k].resize(num_pts); + + for (size_t i = 0; i < num_pts; ++i) { + batch_scalars[k][i] = scalars[k * num_pts + i]; + scalars_copies[k][i] = batch_scalars[k][i]; + } + + batch_points.push_back(std::span(&generators[k * num_pts], num_pts)); + batch_scalar_spans.push_back(batch_scalars[k]); } - for (size_t i = 0; i < num_doublings; ++i) { - result.self_dbl(); + + scalar_multiplication::MSM::batch_multi_scalar_mul(batch_points, batch_scalar_spans); + + for (size_t k = 0; k < num_msms; ++k) { + for (size_t i = 0; i < num_pts; ++i) { + EXPECT_EQ(batch_scalars[k][i], scalars_copies[k][i]) + << "Scalar at MSM " << k << ", index " << i << " was modified"; + } } - EXPECT_EQ(AffineElement(result), AffineElement(expected)); } -} -TYPED_TEST(ScalarMultiplicationTest, PippengerLowMemory) -{ - SCALAR_MULTIPLICATION_TYPE_ALIASES - using AffineElement = typename Curve::AffineElement; + void test_scalar_one() + { + const size_t num_pts = 5; + std::vector test_scalars(num_pts, ScalarField::one()); + std::span points(&generators[0], num_pts); - const size_t num_points = TestFixture::num_points; + PolynomialSpan scalar_span(0, test_scalars); + AffineElement result = scalar_multiplication::MSM::msm(points, scalar_span); - std::span scalars(&TestFixture::scalars[0], num_points); - AffineElement result = - scalar_multiplication::MSM::msm(TestFixture::generators, PolynomialSpan(0, scalars)); + Element expected; + expected.self_set_infinity(); + for (size_t i = 0; i < num_pts; ++i) { + expected += points[i]; + } - AffineElement expected = TestFixture::naive_msm(scalars, TestFixture::generators); - EXPECT_EQ(result, expected); -} + EXPECT_EQ(result, AffineElement(expected)); + } -TYPED_TEST(ScalarMultiplicationTest, BatchMultiScalarMul) -{ - BB_BENCH_NAME("BatchMultiScalarMul"); - SCALAR_MULTIPLICATION_TYPE_ALIASES - using AffineElement = typename Curve::AffineElement; + void test_scalar_minus_one() + { + const size_t num_pts = 5; + std::vector test_scalars(num_pts, -ScalarField::one()); + std::span points(&generators[0], num_pts); - const size_t num_msms = static_cast(engine.get_random_uint8()); - std::vector expected(num_msms); + PolynomialSpan scalar_span(0, test_scalars); + AffineElement result = scalar_multiplication::MSM::msm(points, scalar_span); - std::vector> batch_points_span; - std::vector> batch_scalars_spans; + Element expected; + expected.self_set_infinity(); + for (size_t i = 0; i < num_pts; ++i) { + expected -= points[i]; + } - size_t vector_offset = 0; - for (size_t k = 0; k < num_msms; ++k) { - const size_t num_points = static_cast(engine.get_random_uint16()) % 400; + EXPECT_EQ(result, AffineElement(expected)); + } - ASSERT_LT(vector_offset + num_points, TestFixture::num_points); - std::span batch_scalars(&TestFixture::scalars[vector_offset], num_points); - std::span batch_points(&TestFixture::generators[vector_offset], num_points); + void test_single_point() + { + std::vector test_scalars = { scalars[0] }; + std::span points(&generators[0], 1); - vector_offset += num_points; - batch_points_span.push_back(batch_points); - batch_scalars_spans.push_back(batch_scalars); + PolynomialSpan scalar_span(0, test_scalars); + AffineElement result = scalar_multiplication::MSM::msm(points, scalar_span); - expected[k] = TestFixture::naive_msm(batch_scalars_spans[k], batch_points_span[k]); + AffineElement expected(points[0] * test_scalars[0]); + EXPECT_EQ(result, expected); } - std::vector result = - scalar_multiplication::MSM::batch_multi_scalar_mul(batch_points_span, batch_scalars_spans); + void test_size_thresholds() + { + std::vector test_sizes = { 1, 2, 15, 16, 17, 50, 127, 128, 129, 256, 512 }; - EXPECT_EQ(result, expected); -} + for (size_t num_pts : test_sizes) { + ASSERT_LE(num_pts, num_points); -TYPED_TEST(ScalarMultiplicationTest, BatchMultiScalarMulSparse) -{ - SCALAR_MULTIPLICATION_TYPE_ALIASES - using AffineElement = typename Curve::AffineElement; + std::vector test_scalars(num_pts); + for (size_t i = 0; i < num_pts; ++i) { + test_scalars[i] = scalars[i]; + } - const size_t num_msms = 10; - std::vector expected(num_msms); + std::span points(&generators[0], num_pts); + PolynomialSpan scalar_span(0, test_scalars); - std::vector> batch_scalars(num_msms); - std::vector> batch_input_points(num_msms); - std::vector> batch_points_span; - std::vector> batch_scalars_spans; + AffineElement result = scalar_multiplication::MSM::msm(points, scalar_span); + AffineElement expected = naive_msm(test_scalars, points); - for (size_t k = 0; k < num_msms; ++k) { - const size_t num_points = 33; - auto& scalars = batch_scalars[k]; + EXPECT_EQ(result, expected) << "Failed for size " << num_pts; + } + } - scalars.resize(num_points); + void test_duplicate_points() + { + // Use enough points to trigger Pippenger (> PIPPENGER_THRESHOLD = 16) + const size_t num_pts = 32; + AffineElement base_point = generators[0]; - size_t fixture_offset = k * num_points; + std::vector points(num_pts, base_point); + std::vector test_scalars(num_pts); + ScalarField scalar_sum = ScalarField::zero(); - std::span batch_points(&TestFixture::generators[fixture_offset], num_points); - for (size_t i = 0; i < 13; ++i) { - scalars[i] = 0; - } - for (size_t i = 13; i < 23; ++i) { - scalars[i] = TestFixture::scalars[fixture_offset + i + 13]; - } - for (size_t i = 23; i < num_points; ++i) { - scalars[i] = 0; + for (size_t i = 0; i < num_pts; ++i) { + test_scalars[i] = scalars[i]; + scalar_sum += test_scalars[i]; } - batch_points_span.push_back(batch_points); - batch_scalars_spans.push_back(batch_scalars[k]); - expected[k] = TestFixture::naive_msm(batch_scalars[k], batch_points); + PolynomialSpan scalar_span(0, test_scalars); + // Duplicate points are an edge case (P + P requires doubling, not addition). + // Must use handle_edge_cases=true for correctness with Pippenger. + AffineElement result = scalar_multiplication::MSM::msm(points, scalar_span, /*handle_edge_cases=*/true); + + AffineElement expected(base_point * scalar_sum); + EXPECT_EQ(result, expected); } - std::vector result = - scalar_multiplication::MSM::batch_multi_scalar_mul(batch_points_span, batch_scalars_spans); + void test_mixed_zero_scalars() + { + const size_t num_pts = 100; + std::vector test_scalars(num_pts); + Element expected; + expected.self_set_infinity(); - EXPECT_EQ(result, expected); -} + for (size_t i = 0; i < num_pts; ++i) { + if (i % 2 == 0) { + test_scalars[i] = ScalarField::zero(); + } else { + test_scalars[i] = scalars[i]; + expected += generators[i] * test_scalars[i]; + } + } -TYPED_TEST(ScalarMultiplicationTest, MSM) -{ - SCALAR_MULTIPLICATION_TYPE_ALIASES - using AffineElement = typename Curve::AffineElement; + std::span points(&generators[0], num_pts); + PolynomialSpan scalar_span(0, test_scalars); - const size_t start_index = 1234; - const size_t num_points = TestFixture::num_points - start_index; + AffineElement result = scalar_multiplication::MSM::msm(points, scalar_span); + EXPECT_EQ(result, AffineElement(expected)); + } - PolynomialSpan scalar_span = - PolynomialSpan(start_index, std::span(&TestFixture::scalars[0], num_points)); - AffineElement result = scalar_multiplication::MSM::msm(TestFixture::generators, scalar_span); + void test_pippenger_free_function() + { + const size_t num_pts = 200; + std::vector test_scalars(num_pts); + for (size_t i = 0; i < num_pts; ++i) { + test_scalars[i] = scalars[i]; + } - std::span points(&TestFixture::generators[start_index], num_points); - AffineElement expected = TestFixture::naive_msm(scalar_span.span, points); - EXPECT_EQ(result, expected); -} + std::span points(&generators[0], num_pts); + PolynomialSpan scalar_span(0, test_scalars); -TYPED_TEST(ScalarMultiplicationTest, MSMAllZeroes) -{ - SCALAR_MULTIPLICATION_TYPE_ALIASES - using AffineElement = typename Curve::AffineElement; + auto result = scalar_multiplication::pippenger(scalar_span, points); - const size_t start_index = 1234; - const size_t num_points = TestFixture::num_points - start_index; - std::vector scalars(num_points); + AffineElement expected = naive_msm(test_scalars, points); + EXPECT_EQ(AffineElement(result), expected); + } + + void test_pippenger_unsafe_free_function() + { + const size_t num_pts = 200; + std::vector test_scalars(num_pts); + for (size_t i = 0; i < num_pts; ++i) { + test_scalars[i] = scalars[i]; + } - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = 0; + std::span points(&generators[0], num_pts); + PolynomialSpan scalar_span(0, test_scalars); + + auto result = scalar_multiplication::pippenger_unsafe(scalar_span, points); + + AffineElement expected = naive_msm(test_scalars, points); + EXPECT_EQ(AffineElement(result), expected); } +}; - PolynomialSpan scalar_span = PolynomialSpan(start_index, scalars); - AffineElement result = scalar_multiplication::MSM::msm(TestFixture::generators, scalar_span); +using CurveTypes = ::testing::Types; +TYPED_TEST_SUITE(ScalarMultiplicationTest, CurveTypes); - EXPECT_EQ(result, Curve::Group::affine_point_at_infinity); -} +// ======================= Test Wrappers ======================= +TYPED_TEST(ScalarMultiplicationTest, GetScalarSlice) +{ + this->test_get_scalar_slice(); +} +TYPED_TEST(ScalarMultiplicationTest, ConsumePointBatch) +{ + this->test_consume_point_batch(); +} +TYPED_TEST(ScalarMultiplicationTest, ConsumePointBatchAndAccumulate) +{ + this->test_consume_point_batch_and_accumulate(); +} +TYPED_TEST(ScalarMultiplicationTest, RadixSortCountZeroEntries) +{ + this->test_radix_sort_count_zero_entries(); +} +TYPED_TEST(ScalarMultiplicationTest, PippengerLowMemory) +{ + this->test_pippenger_low_memory(); +} +TYPED_TEST(ScalarMultiplicationTest, BatchMultiScalarMul) +{ + this->test_batch_multi_scalar_mul(); +} +TYPED_TEST(ScalarMultiplicationTest, BatchMultiScalarMulSparse) +{ + this->test_batch_multi_scalar_mul_sparse(); +} +TYPED_TEST(ScalarMultiplicationTest, MSM) +{ + this->test_msm(); +} +TYPED_TEST(ScalarMultiplicationTest, MSMAllZeroes) +{ + this->test_msm_all_zeroes(); +} TYPED_TEST(ScalarMultiplicationTest, MSMEmptyPolynomial) { - SCALAR_MULTIPLICATION_TYPE_ALIASES - using AffineElement = typename Curve::AffineElement; - - const size_t num_points = 0; - std::vector scalars(num_points); - std::vector input_points(num_points); - PolynomialSpan scalar_span = PolynomialSpan(0, scalars); - AffineElement result = scalar_multiplication::MSM::msm(input_points, scalar_span); - - EXPECT_EQ(result, Curve::Group::affine_point_at_infinity); + this->test_msm_empty_polynomial(); } - -// Helper function to generate scalars with specified sparsity -template -std::vector generate_sparse_scalars(size_t num_scalars, double sparsity_rate, auto& rng) +TYPED_TEST(ScalarMultiplicationTest, ScalarsUnchangedAfterMSM) { - std::vector scalars(num_scalars); - for (size_t i = 0; i < num_scalars; ++i) { - // Generate random value to determine if this scalar should be zero - double rand_val = static_cast(rng.get_random_uint32()) / static_cast(UINT32_MAX); - if (rand_val < sparsity_rate) { - scalars[i] = 0; - } else { - scalars[i] = ScalarField::random_element(&rng); - } - } - return scalars; + this->test_scalars_unchanged_after_msm(); } - -// Test different MSM strategies with detailed benchmarking -// NOTE this requres BB_BENCH=1 to be set before the test command -TYPED_TEST(ScalarMultiplicationTest, BenchBatchMsm) +TYPED_TEST(ScalarMultiplicationTest, ScalarsUnchangedAfterBatchMultiScalarMul) { -#ifndef __wasm__ - if (!bb::detail::use_bb_bench) { -#else - { -#endif - std::cout - << "Skipping BatchMultiScalarMulStrategyComparison as BB_BENCH=1 is not passed (OR we are in wasm).\n"; - return; - } - SCALAR_MULTIPLICATION_TYPE_ALIASES - - using AffineElement = typename Curve::AffineElement; - - const size_t num_msms = 3; - const size_t msm_max_size = 1 << 17; - const double max_sparsity = 0.1; - - // Generate test data with varying sparsity - std::vector> all_points; - std::vector> all_scalars; - std::vector all_commitments; - std::vector> scalar_storage; - - for (size_t i = 0; i < num_msms; ++i) { - // Generate random sizes and density of 0s - const size_t size = engine.get_random_uint64() % msm_max_size; - const double sparsity = engine.get_random_uint8() / 255.0 * max_sparsity; - auto scalars = generate_sparse_scalars(size, sparsity, engine); - scalar_storage.push_back(std::move(scalars)); - - std::span points(&TestFixture::generators[i], size); - all_points.push_back(points); - all_scalars.push_back(scalar_storage.back()); - all_commitments.push_back(TestFixture::naive_msm(all_scalars.back(), all_points.back())); - } - auto func = [&](size_t num_threads) { - set_parallel_for_concurrency(num_threads); - // Strategy 1: Individual MSMs - { - BB_BENCH_NAME((bb::detail::concat())); - for (size_t i = 0; i < num_msms; ++i) { - std::vector> single_points = { all_points[i] }; - std::vector> single_scalars = { all_scalars[i] }; - auto result = scalar_multiplication::MSM::batch_multi_scalar_mul(single_points, single_scalars); - EXPECT_EQ(result[0], all_commitments[i]); - } - } - // Strategy 2: Batch - { - BB_BENCH_NAME((bb::detail::concat())); - auto result = scalar_multiplication::MSM::batch_multi_scalar_mul(all_points, all_scalars); - EXPECT_EQ(result, all_commitments); - } - }; - // call lambda with template param - func.template operator()<"1 thread ">(1); - func.template operator()<"2 threads ">(2); - func.template operator()<"4 threads ">(4); - func.template operator()<"8 threads ">(8); - func.template operator()<"16 threads ">(16); - func.template operator()<"32 threads ">(32); + this->test_scalars_unchanged_after_batch_multi_scalar_mul(); +} +TYPED_TEST(ScalarMultiplicationTest, ScalarOne) +{ + this->test_scalar_one(); +} +TYPED_TEST(ScalarMultiplicationTest, ScalarMinusOne) +{ + this->test_scalar_minus_one(); +} +TYPED_TEST(ScalarMultiplicationTest, SinglePoint) +{ + this->test_single_point(); +} +TYPED_TEST(ScalarMultiplicationTest, SizeThresholds) +{ + this->test_size_thresholds(); +} +TYPED_TEST(ScalarMultiplicationTest, DuplicatePoints) +{ + this->test_duplicate_points(); +} +TYPED_TEST(ScalarMultiplicationTest, MixedZeroScalars) +{ + this->test_mixed_zero_scalars(); +} +TYPED_TEST(ScalarMultiplicationTest, PippengerFreeFunction) +{ + this->test_pippenger_free_function(); +} +TYPED_TEST(ScalarMultiplicationTest, PippengerUnsafeFreeFunction) +{ + this->test_pippenger_unsafe_free_function(); } +// Non-templated test for explicit small inputs TEST(ScalarMultiplication, SmallInputsExplicit) { uint256_t x0(0x68df84429941826a, 0xeb08934ed806781c, 0xc14b6a2e4f796a73, 0x08dc1a9a11a3c8db); diff --git a/barretenberg/cpp/src/barretenberg/srs/scalar_multiplication.test.cpp b/barretenberg/cpp/src/barretenberg/srs/scalar_multiplication.test.cpp index 433fcc95056d..db44c3976c92 100644 --- a/barretenberg/cpp/src/barretenberg/srs/scalar_multiplication.test.cpp +++ b/barretenberg/cpp/src/barretenberg/srs/scalar_multiplication.test.cpp @@ -121,9 +121,9 @@ TYPED_TEST(ScalarMultiplicationTests, RadixSort) Fr::__copy(source_scalar, scalars[i]); } - size_t bits_per_slice = scalar_multiplication::MSM::get_optimal_log_num_buckets(target_degree); + uint32_t bits_per_slice = scalar_multiplication::MSM::get_optimal_log_num_buckets(target_degree); - for (size_t i = 0; i < num_rounds; ++i) { + for (uint32_t i = 0; i < num_rounds; ++i) { std::vector scalar_slices(target_degree); std::vector sorted_scalar_slices(target_degree); @@ -132,7 +132,7 @@ TYPED_TEST(ScalarMultiplicationTests, RadixSort) scalar_slices[j] = scalar_multiplication::MSM::get_scalar_slice(scalars[j], i, bits_per_slice); sorted_scalar_slices[j] = scalar_slices[j]; } - scalar_multiplication::process_buckets_count_zero_entries( + scalar_multiplication::sort_point_schedule_and_count_zero_buckets( &sorted_scalar_slices[0], target_degree, static_cast(bits_per_slice)); const auto find_entry = [scalar_slices, num_entries = target_degree](auto x) {