diff --git a/barretenberg/cpp/CLAUDE.md b/barretenberg/cpp/CLAUDE.md
index dbe3aaa51ad1..e6d98e4776ca 100644
--- a/barretenberg/cpp/CLAUDE.md
+++ b/barretenberg/cpp/CLAUDE.md
@@ -2,6 +2,14 @@ succinct aztec-packages cheat sheet.
THE PROJECT ROOT IS AT TWO LEVELS ABOVE THIS FOLDER. Typically, the repository is at ~/aztec-packages. all advice is from the root.
+# Git workflow for barretenberg
+
+**IMPORTANT**: When comparing branches or looking at diffs for barretenberg work, use `merge-train/barretenberg` as the base branch, NOT `master`. The master branch is often outdated for barretenberg development.
+
+Examples:
+- `git diff merge-train/barretenberg...HEAD` (not `git diff master...HEAD`)
+- `git log merge-train/barretenberg..HEAD` (not `git log master..HEAD`)
+
Run ./bootstrap.sh at the top-level to be sure the repo fully builds.
Bootstrap scripts can be called with relative paths e.g. ../barretenberg/bootstrap.sh
diff --git a/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh b/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
index 630d928089d0..748bb978fef4 100755
--- a/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
+++ b/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
@@ -16,7 +16,7 @@ PRESET=${3:-clang20}
BUILD_DIR=${4:-build}
HARDWARE_CONCURRENCY=${HARDWARE_CONCURRENCY:-16}
-BASELINE_BRANCH="master"
+BASELINE_BRANCH="${BASELINE_BRANCH:-merge-train/barretenberg}"
BENCH_TOOLS_DIR="$BUILD_DIR/_deps/benchmark-src/tools"
if [ ! -z "$(git status --untracked-files=no --porcelain)" ]; then
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/README.md b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/README.md
new file mode 100644
index 000000000000..17e135237eeb
--- /dev/null
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/README.md
@@ -0,0 +1,183 @@
+# Pippenger Multi-Scalar Multiplication (MSM)
+
+## Overview
+
+The Pippenger algorithm computes multi-scalar multiplications:
+
+$$\text{MSM}(\vec{s}, \vec{P}) = \sum_{i=0}^{n-1} s_i \cdot P_i$$
+
+**Complexity**: Let $q = \lceil \log_2(\text{field modulus}) \rceil$ be the scalar bit-length, $|A|$ the cost of a group addition, and $|D|$ the cost of a doubling.
+
+- **Pippenger**: $O\left(\frac{q}{c} \cdot \left((n + 2^c) \cdot |A| + c \cdot |D|\right)\right)$
+- **Naive**: $O(n \cdot q \cdot |D| + n \cdot q \cdot |A| / 2)$
+
+With $c \approx \frac{1}{2} \log_2 n$, Pippenger achieves roughly $O(n \cdot q / \log n)$ vs $O(n \cdot q)$ for naive scalar multiplication.
+
+## Algorithm
+
+### Step 1: Scalar Decomposition
+
+**Implementation**: `get_scalar_slice(scalar, round_index, bits_per_slice)`
+
+Each scalar $s_i$ is decomposed into $r$ slices of $c$ bits each, processed **MSB-first**:
+
+$$s_i = \sum_{j=0}^{r-1} s_i^{(j)} \cdot 2^{c(r-1-j)}$$
+
+- $c$ = bits per slice (from `get_optimal_log_num_buckets`, which brute-force searches for minimum cost)
+- $r = \lceil \texttt{NUM\_BITS\_IN\_FIELD} / c \rceil$ = number of rounds
+- Round 0 extracts the most significant bits
+
+### Step 2: Bucket Accumulation
+
+For each round $j$, points are added into **buckets** based on their scalar slice. Bucket $k$ accumulates all points whose slice value equals $k$:
+
+$$B_k^{(j)} = \sum_{\{i : s_i^{(j)} = k\}} P_i$$
+
+**Two implementation paths:**
+
+- **Affine**: Sorts points by bucket and uses batched affine additions
+- **Jacobian**: Direct bucket accumulation in Jacobian coordinates
+
+### Step 3: Bucket Reduction
+
+**Implementation**: `accumulate_buckets(bucket_accumulators)`
+
+Computes weighted sum using a suffix sum (high to low):
+
+$$R^{(j)} = \sum_{k=1}^{2^c - 1} k \cdot B_k^{(j)} = \sum_{k=1}^{2^c - 1} \left( \sum_{m=k}^{2^c - 1} B_m^{(j)} \right)$$
+
+An offset generator is added and subtracted to avoid rare accumulator edge cases—a probabilistic mitigation that simplifies accumulation logic.
+
+### Step 4: Round Combination
+
+Combines all rounds using Horner's method (MSB-first):
+
+```cpp
+msm_accumulator = point_at_infinity
+for j = 0 to r-1:
+ repeat c doublings (or fewer for final round)
+ msm_accumulator += bucket_result[j]
+```
+
+## Algorithm Variants
+
+### Entry Points and Safety
+
+| Entry Point | Default | Safety |
+|-------------|---------|--------|
+| `msm()` | `handle_edge_cases=false` | ⚠️ **Unsafe** |
+| `pippenger()` | `handle_edge_cases=true` | ✓ Safe |
+| `pippenger_unsafe()` | `handle_edge_cases=false` | ⚠️ Unsafe |
+| `batch_multi_scalar_mul()` | `handle_edge_cases=true` | ✓ Safe |
+
+### Edge Cases
+
+Affine addition fails for **P = Q** (doubling), **P = −Q** (inverse), and **P = O** (identity). Jacobian coordinates handle these correctly at higher cost (~2-3× slower).
+
+⚠️ **Use `msm()` or `pippenger_unsafe()` only when points are guaranteed linearly independent** (e.g., SRS points). For user-controlled or potentially duplicate points, use `pippenger()`.
+
+### Affine Pippenger (`handle_edge_cases=false`)
+
+Uses affine coordinates with Montgomery's batch inversion trick: replaces $m$ inversions with **1 inversion + O(m) multiplications**, yielding ~2-3× speedup over Jacobian.
+
+### Jacobian Pippenger (`handle_edge_cases=true`)
+
+Uses Jacobian coordinates for bucket accumulators. Handles all edge cases correctly.
+
+## Tuning Constants
+
+| Constant | Value | Purpose |
+|----------|-------|---------|
+| `PIPPENGER_THRESHOLD` | 16 | Below this, use naive scalar multiplication |
+| `AFFINE_TRICK_THRESHOLD` | 128 | Below this, batch inversion overhead exceeds savings |
+| `MAX_SLICE_BITS` | 20 | Upper bound on bucket count exponent |
+| `BATCH_SIZE` | 2048 | Points per batch inversion (fits L2 cache) |
+| `RADIX_BITS` | 8 | Bits per radix sort pass |
+
+
+Cost model constants and derivations
+
+| Constant | Value | Derivation |
+|----------|-------|------------|
+| `BUCKET_ACCUMULATION_COST` | 5 | 2 Jacobian adds/bucket × 2.5× cost ratio |
+| `AFFINE_TRICK_SAVINGS_PER_OP` | 5 | ~10 muls saved − ~3 muls for product tree |
+| `JACOBIAN_Z_NOT_ONE_PENALTY` | 5 | Extra field ops when Z ≠ 1 |
+| `INVERSION_TABLE_COST` | 14 | 4-bit lookup table for modular exp |
+
+**BATCH_SIZE=2048**: Each `AffineElement` is 64 bytes. 2048 points = 128 KB, fitting in L2 cache.
+
+**RADIX_BITS=8**: 256 radix buckets × 4 bytes = 1 KB counting array, fits in L1 cache.
+
+
+
+## Implementation Notes
+
+### Zero Scalar Filtering
+
+`transform_scalar_and_get_nonzero_scalar_indices` filters out zero scalars before processing (since $0 \cdot P_i = \mathcal{O}$). Scalars are converted from Montgomery form in-place to avoid doubling memory usage.
+
+### Bucket Existence Tracking
+
+A `BitVector` bitmap tracks which buckets are populated, avoiding expensive full-array clears between rounds. Clearing the bitmap costs $O(2^c / 64)$ words vs $O(2^c)$ for the full bucket array.
+
+### Point Scheduling (Affine Variant Only)
+
+Entries are packed as `(point_index << 32) | bucket_index` into 64-bit values. Since bucket indices fit in $c$ bits (typically 8-16), they occupy only the lowest bits of the packed entry. An **in-place MSD radix sort** on the low $c$ bits groups points by bucket for efficient batch processing. The sort also detects entries with `bucket_index == 0` during the final radix pass, allowing zero-bucket entries to be skipped without a separate scan.
+
+### Batched Affine Addition
+
+`batch_accumulate_points_into_buckets` processes sorted points iteratively:
+- Same-bucket pairs → queue for batch addition
+- Different buckets → cache in bucket or queue with existing accumulator
+- Uses branchless conditional moves to minimize pipeline stalls
+- Prefetches future points to hide memory latency
+- Recirculates results to maximize batch efficiency before writing to buckets
+
+
+Batch accumulation case analysis
+
+| Condition | Action | Iterator Update |
+|-----------|--------|-----------------|
+| `bucket[i] == bucket[i+1]` | Queue both points for batch add | `point_it += 2` |
+| Different buckets, accumulator exists | Queue point + accumulator | `point_it += 1` |
+| Different buckets, no accumulator | Cache point into bucket | `point_it += 1` |
+
+After batch addition, results targeting the same bucket are paired again before writing to bucket accumulators, reducing random memory access by ~50%.
+
+
+
+## Parallelization
+
+Uses **per-thread buffers** (bucket accumulators, scratch space) to eliminate contention.
+
+For `batch_multi_scalar_mul()`, work is distributed via `MSMWorkUnit` structures that can split a single MSM across multiple threads. Each thread computes partial results on point subsets, combined in a final reduction.
+
+
+Per-call buffer sizes
+
+| Buffer | Size | Purpose |
+|--------|------|---------|
+| `BucketAccumulators` (affine) | $2^c × 64$ bytes | Affine bucket array + bitmap |
+| `JacobianBucketAccumulators` | $2^c × 96$ bytes | Jacobian bucket array + bitmap |
+| `AffineAdditionData` | ~400 KB | Scratch for batch inversion |
+| `point_schedule` | $n × 8$ bytes | Per-MSM point schedule |
+
+Buffers are allocated per-call for WASM compatibility. Memory scales with thread count during parallel execution.
+
+
+
+## File Structure
+
+```
+scalar_multiplication/
+├── scalar_multiplication.hpp # MSM class, data structures
+├── scalar_multiplication.cpp # Core algorithm
+├── process_buckets.hpp/cpp # Radix sort
+├── bitvector.hpp # Bit vector for bucket tracking
+└── README.md # This file
+```
+
+## References
+
+1. Pippenger, N. (1976). "On the evaluation of powers and related problems"
+2. Bernstein, D.J. et al. "Faster batch forgery identification" (batch inversion)
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp
index a6e767961b44..c527b7b7b1b4 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.cpp
@@ -10,89 +10,97 @@
namespace bb::scalar_multiplication {
-// NOLINTNEXTLINE(misc-no-recursion) recursion is fine here, max recursion depth is 8 (64 bit int / 8 bits per call)
+// NOLINTNEXTLINE(misc-no-recursion) recursion is fine here, max depth is 4 (32-bit bucket index / 8 bits per call)
void radix_sort_count_zero_entries(uint64_t* keys,
const size_t num_entries,
const uint32_t shift,
size_t& num_zero_entries,
- const uint32_t total_bits,
- const uint64_t* start_pointer) noexcept
+ const uint32_t bucket_index_bits,
+ const uint64_t* top_level_keys) noexcept
{
- constexpr size_t num_bits = 8;
- constexpr size_t num_buckets = 1UL << num_bits;
- constexpr uint32_t mask = static_cast<uint32_t>(num_buckets) - 1U;
- std::array<uint32_t, num_buckets> bucket_counts{};
+ constexpr size_t NUM_RADIX_BUCKETS = 1UL << RADIX_BITS;
+ constexpr uint32_t RADIX_MASK = static_cast<uint32_t>(NUM_RADIX_BUCKETS) - 1U;
+ // Step 1: Count entries in each radix bucket
+ std::array<uint32_t, NUM_RADIX_BUCKETS> bucket_counts{};
for (size_t i = 0; i < num_entries; ++i) {
- bucket_counts[(keys[i] >> shift) & mask]++;
+ bucket_counts[(keys[i] >> shift) & RADIX_MASK]++;
}
- std::array<uint32_t, num_buckets + 1> offsets;
- std::array<uint32_t, num_buckets + 1> offsets_copy;
+ // Step 2: Convert counts to cumulative offsets (prefix sum)
+ std::array<uint32_t, NUM_RADIX_BUCKETS + 1> offsets;
+ std::array<uint32_t, NUM_RADIX_BUCKETS + 1> offsets_copy;
offsets[0] = 0;
-
- for (size_t i = 0; i < num_buckets - 1; ++i) {
+ for (size_t i = 0; i < NUM_RADIX_BUCKETS - 1; ++i) {
bucket_counts[i + 1] += bucket_counts[i];
}
- if ((shift == 0) && (keys == start_pointer)) {
+
+ // Count zero entries only at the final recursion level (shift == 0) and only for the full array
+ if ((shift == 0) && (keys == top_level_keys)) {
num_zero_entries = bucket_counts[0];
}
- for (size_t i = 1; i < num_buckets + 1; ++i) {
+
+ for (size_t i = 1; i < NUM_RADIX_BUCKETS + 1; ++i) {
offsets[i] = bucket_counts[i - 1];
}
- for (size_t i = 0; i < num_buckets + 1; ++i) {
+ for (size_t i = 0; i < NUM_RADIX_BUCKETS + 1; ++i) {
offsets_copy[i] = offsets[i];
}
- uint64_t* start = &keys[0];
- for (size_t i = 0; i < num_buckets; ++i) {
+ // Step 3: In-place permutation using cycle sort
+ // For each radix bucket, repeatedly swap elements to their correct positions until all elements
+ // in that bucket's range belong there. The offsets array tracks the next write position for each bucket.
+ uint64_t* start = &keys[0];
+ for (size_t i = 0; i < NUM_RADIX_BUCKETS; ++i) {
uint64_t* bucket_start = &keys[offsets[i]];
const uint64_t* bucket_end = &keys[offsets_copy[i + 1]];
while (bucket_start != bucket_end) {
for (uint64_t* it = bucket_start; it < bucket_end; ++it) {
- const size_t value = (*it >> shift) & mask;
+ const size_t value = (*it >> shift) & RADIX_MASK;
const uint64_t offset = offsets[value]++;
std::iter_swap(it, start + offset);
}
bucket_start = &keys[offsets[i]];
}
}
+
+ // Step 4: Recursively sort each bucket by the next less-significant byte
if (shift > 0) {
- for (size_t i = 0; i < num_buckets; ++i) {
- if (offsets_copy[i + 1] - offsets_copy[i] > 1) {
- radix_sort_count_zero_entries(&keys[offsets_copy[i]],
- offsets_copy[i + 1] - offsets_copy[i],
- shift - 8,
- num_zero_entries,
- total_bits,
- keys);
+ for (size_t i = 0; i < NUM_RADIX_BUCKETS; ++i) {
+ const size_t bucket_size = offsets_copy[i + 1] - offsets_copy[i];
+ if (bucket_size > 1) {
+ radix_sort_count_zero_entries(
+ &keys[offsets_copy[i]], bucket_size, shift - RADIX_BITS, num_zero_entries, bucket_index_bits, keys);
}
}
}
}
-size_t process_buckets_count_zero_entries(uint64_t* wnaf_entries,
- const size_t num_entries,
- const uint32_t num_bits) noexcept
+size_t sort_point_schedule_and_count_zero_buckets(uint64_t* point_schedule,
+ const size_t num_entries,
+ const uint32_t bucket_index_bits) noexcept
{
if (num_entries == 0) {
return 0;
}
- const uint32_t bits_per_round = 8;
- const uint32_t base = num_bits & 7;
- const uint32_t total_bits = (base == 0) ? num_bits : num_bits - base + 8;
- const uint32_t shift = total_bits - bits_per_round;
+
+ // Round bucket_index_bits up to next multiple of RADIX_BITS for proper MSD radix sort alignment.
+ // E.g., if bucket_index_bits=10, we need to start sorting from bit 16 (2 bytes) not bit 10.
+ const uint32_t remainder = bucket_index_bits % RADIX_BITS;
+ const uint32_t padded_bits = (remainder == 0) ? bucket_index_bits : bucket_index_bits - remainder + RADIX_BITS;
+ const uint32_t initial_shift = padded_bits - RADIX_BITS;
+
size_t num_zero_entries = 0;
- radix_sort_count_zero_entries(wnaf_entries, num_entries, shift, num_zero_entries, num_bits, wnaf_entries);
-
- // inside radix_sort_count_zero_entries, if the least significant *byte* of `wnaf_entries[0] == 0`,
- // then num_nonzero_entries = number of entries that share the same value as wnaf_entries[0].
- // If wnaf_entries[0] != 0, we must manually set num_zero_entries = 0
- if (num_entries > 0) {
- if ((wnaf_entries[0] & 0xffffffff) != 0) {
- num_zero_entries = 0;
- }
+ radix_sort_count_zero_entries(
+ point_schedule, num_entries, initial_shift, num_zero_entries, bucket_index_bits, point_schedule);
+
+ // The radix sort counts entries where the least significant BYTE is zero, but we need entries where
+ // the entire bucket_index (lower 32 bits) is zero. Verify the first entry after sorting.
+ if ((point_schedule[0] & BUCKET_INDEX_MASK) != 0) {
+ num_zero_entries = 0;
}
+
return num_zero_entries;
}
+
} // namespace bb::scalar_multiplication
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.hpp
index e964fcc7adf0..7c5a8c68b881 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/process_buckets.hpp
@@ -10,13 +10,47 @@
#include <cstdint>
namespace bb::scalar_multiplication {
+
+// Number of bits processed per radix sort pass
+constexpr uint32_t RADIX_BITS = 8;
+
+// Mask for extracting bucket index from point schedule entry (lower 32 bits)
+constexpr uint64_t BUCKET_INDEX_MASK = 0xffffffff;
+
+/**
+ * @brief Recursive MSD radix sort that also counts entries with zero bucket index
+ * @details Sorts point schedule entries by their bucket index (lower 32 bits) using most-significant-digit
+ * radix sort. Processes RADIX_BITS bits per recursive call, starting from the most significant byte.
+ * As a side effect, counts entries where bucket_index == 0 (which are skipped during MSM).
+ *
+ * @param keys Array of point schedule entries to sort in-place (format: point_index << 32 | bucket_index)
+ * @param num_entries Number of entries to sort
+ * @param shift Current bit position to sort on (starts high, decreases by RADIX_BITS each recursion)
+ * @param[out] num_zero_entries Count of entries with bucket_index == 0 (only set at final recursion level)
+ * @param bucket_index_bits Number of bits in the bucket index (used to determine recursion depth)
+ * @param top_level_keys Pointer to original array start; used to detect top-level call for zero counting
+ */
void radix_sort_count_zero_entries(uint64_t* keys,
- const size_t num_entries,
- const uint32_t shift,
+ size_t num_entries,
+ uint32_t shift,
size_t& num_zero_entries,
- const uint32_t total_bits,
- const uint64_t* start_pointer) noexcept;
-size_t process_buckets_count_zero_entries(uint64_t* wnaf_entries,
- const size_t num_entries,
- const uint32_t num_bits) noexcept;
+ uint32_t bucket_index_bits,
+ const uint64_t* top_level_keys) noexcept;
+
+/**
+ * @brief Sort point schedule by bucket index and count zero-bucket entries
+ * @details Entry point for radix sort. Sorts entries so that points targeting the same bucket
+ * are contiguous, improving cache locality during bucket accumulation. Also counts
+ * entries with bucket_index == 0, which represent scalar bits that don't contribute
+ * to any bucket in the current Pippenger round.
+ *
+ * @param point_schedule Array of packed entries (point_index << 32 | bucket_index) to sort in-place
+ * @param num_entries Number of entries
+ * @param bucket_index_bits Number of bits used for bucket indices (determines sort range)
+ * @return Number of entries with bucket_index == 0 (these are at the start after sorting)
+ */
+size_t sort_point_schedule_and_count_zero_buckets(uint64_t* point_schedule,
+ size_t num_entries,
+ uint32_t bucket_index_bits) noexcept;
+
} // namespace bb::scalar_multiplication
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp
index d4199c0e6cd4..7e73e8ddbe66 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp
@@ -21,21 +21,14 @@
namespace bb::scalar_multiplication {
-/**
- * @brief Fallback method for very small numbers of input points
- *
- * @tparam Curve
- * @param scalars
- * @param points
- * @param range
- * @return Curve::Element
- */
-template <typename Curve>
-typename Curve::Element small_mul(std::span<typename Curve::ScalarField>& scalars,
- std::span<const typename Curve::AffineElement>& points,
- std::span<uint32_t> scalar_indices,
- size_t range) noexcept
+// Naive double-and-add fallback for small inputs (< PIPPENGER_THRESHOLD points).
+template <typename Curve> typename Curve::Element small_mul(const typename MSM<Curve>::MSMData& msm_data) noexcept
{
+ const auto& scalars = msm_data.scalars;
+ const auto& points = msm_data.points;
+ const auto& scalar_indices = msm_data.scalar_indices;
+ const size_t range = scalar_indices.size();
+
typename Curve::Element r = Curve::Group::point_at_infinity;
for (size_t i = 0; i < range; ++i) {
typename Curve::Element f = points[scalar_indices[i]];
@@ -44,21 +37,14 @@ typename Curve::Element small_mul(std::span&
return r;
}
-/**
- * @brief Convert scalar out of Montgomery form. Populate `consolidated_indices` with nonzero scalar indices
- *
- * @tparam Curve
- * @param scalars
- * @param consolidated_indices
- */
template <typename Curve>
void MSM<Curve>::transform_scalar_and_get_nonzero_scalar_indices(std::span<typename Curve::ScalarField> scalars,
- std::vector<uint32_t>& consolidated_indices) noexcept
+ std::vector<uint32_t>& nonzero_scalar_indices) noexcept
{
std::vector<std::vector<uint32_t>> thread_indices(get_num_cpus());
+ // Pass 1: Each thread converts from Montgomery and collects nonzero indices into its own vector
parallel_for([&](const ThreadChunk& chunk) {
- // parallel_for with ThreadChunk uses get_num_cpus() threads
BB_ASSERT_EQ(chunk.total_threads, thread_indices.size());
auto range = chunk.range(scalars.size());
if (range.empty()) {
@@ -71,9 +57,7 @@ void MSM::transform_scalar_and_get_nonzero_scalar_indices(std::span(i));
}
}
@@ -83,32 +67,21 @@ void MSM::transform_scalar_and_get_nonzero_scalar_indices(std::span::ThreadWorkUnits>
- */
template <typename Curve>
std::vector<typename MSM<Curve>::ThreadWorkUnits> MSM<Curve>::get_work_units(
std::span<std::span<typename Curve::ScalarField>> scalars, std::vector<std::vector<uint32_t>>& msm_scalar_indices) noexcept
@@ -117,7 +90,6 @@ std::vector::ThreadWorkUnits> MSM::get_work_units(
const size_t num_msms = scalars.size();
msm_scalar_indices.resize(num_msms);
for (size_t i = 0; i < num_msms; ++i) {
- BB_ASSERT_LT(i, scalars.size());
transform_scalar_and_get_nonzero_scalar_indices(scalars[i], msm_scalar_indices[i]);
}
@@ -130,12 +102,9 @@ std::vector::ThreadWorkUnits> MSM::get_work_units(
std::vector work_units(num_threads);
const size_t work_per_thread = numeric::ceil_div(total_work, num_threads);
- size_t work_of_last_thread = total_work - (work_per_thread * (num_threads - 1));
+ const size_t work_of_last_thread = total_work - (work_per_thread * (num_threads - 1));
- // [(MSMs + T - 1) / T] * [T - 1] > MSMs
- // T = 192
- // ([M + 191] / 192) * 193 > M
- // only use a single work unit if we don't have enough work for every thread
+ // Only use a single work unit if we don't have enough work for every thread
if (num_threads > total_work) {
for (size_t i = 0; i < num_msms; ++i) {
work_units[0].push_back(MSMWorkUnit{
@@ -150,31 +119,28 @@ std::vector::ThreadWorkUnits> MSM::get_work_units(
size_t thread_accumulated_work = 0;
size_t current_thread_idx = 0;
for (size_t i = 0; i < num_msms; ++i) {
- BB_ASSERT_DEBUG(i < msm_scalar_indices.size());
- size_t msm_work = msm_scalar_indices[i].size();
- size_t msm_size = msm_work;
- while (msm_work > 0) {
+ size_t msm_work_remaining = msm_scalar_indices[i].size();
+ const size_t initial_msm_work = msm_work_remaining;
+
+ while (msm_work_remaining > 0) {
+ BB_ASSERT_LT(current_thread_idx, work_units.size());
+
const size_t total_thread_work =
(current_thread_idx == num_threads - 1) ? work_of_last_thread : work_per_thread;
const size_t available_thread_work = total_thread_work - thread_accumulated_work;
+ const size_t work_to_assign = std::min(available_thread_work, msm_work_remaining);
- if (available_thread_work >= msm_work) {
- BB_ASSERT_LT(current_thread_idx, work_units.size());
- work_units[current_thread_idx].push_back(MSMWorkUnit{
- .batch_msm_index = i,
- .start_index = msm_size - msm_work,
- .size = msm_work,
- });
- thread_accumulated_work += msm_work;
- msm_work = 0;
- } else {
- BB_ASSERT_LT(current_thread_idx, work_units.size());
- work_units[current_thread_idx].push_back(MSMWorkUnit{
- .batch_msm_index = i,
- .start_index = msm_size - msm_work,
- .size = available_thread_work,
- });
- msm_work -= available_thread_work;
+ work_units[current_thread_idx].push_back(MSMWorkUnit{
+ .batch_msm_index = i,
+ .start_index = initial_msm_work - msm_work_remaining,
+ .size = work_to_assign,
+ });
+
+ thread_accumulated_work += work_to_assign;
+ msm_work_remaining -= work_to_assign;
+
+ // Move to next thread if current thread is full
+ if (thread_accumulated_work >= total_thread_work) {
current_thread_idx++;
thread_accumulated_work = 0;
}
@@ -184,142 +150,91 @@ std::vector::ThreadWorkUnits> MSM::get_work_units(
}
/**
- * @brief Given a scalar that is *NOT* in Montgomery form, extract a `slice_size`-bit chunk
- * @brief At round i, we extract `slice_size * (i-1)` to `slice_sice * i` most significant bits.
+ * @brief Extract a slice of bits from a scalar for Pippenger bucket assignment
+ * @details Extracts bits [lo_bit, hi_bit) from the scalar's raw limb representation.
+ * The scalar must already be converted out of Montgomery form.
+ *
+ * IMPORTANT RESTRICTIONS (optimized for Pippenger's specific usage pattern):
+ * - slice_size must be <= 32 bits (returns uint32_t)
+ * - The bit range must span at most 2 limbs (satisfied when slice_size <= 64)
+ * - hi_bit must be < 256 to avoid out-of-bounds access (satisfied since hi_bit <= NUM_BITS_IN_FIELD < 256)
*
- * @tparam Curve
- * @param scalar
- * @param round
- * @param normal_slice_size
- * @return uint32_t
+ * @param scalar The scalar field element (must be in non-Montgomery form)
+ * @param round The current Pippenger round (0 = most significant bits)
+ * @param slice_size Number of bits per slice
+ * @return uint32_t The bucket index for this round
*/
template <typename Curve>
uint32_t MSM<Curve>::get_scalar_slice(const typename Curve::ScalarField& scalar,
size_t round,
size_t slice_size) noexcept
{
+ constexpr size_t LIMB_BITS = 64;
+
size_t hi_bit = NUM_BITS_IN_FIELD - (round * slice_size);
- // todo remove
- bool last_slice = hi_bit < slice_size;
- size_t target_slice_size = last_slice ? hi_bit : slice_size;
- size_t lo_bit = last_slice ? 0 : hi_bit - slice_size;
- size_t start_limb = lo_bit / 64;
- size_t end_limb = hi_bit / 64;
- size_t lo_slice_offset = lo_bit & 63;
- size_t lo_slice_bits = std::min(target_slice_size, 64 - lo_slice_offset);
- size_t hi_slice_bits = target_slice_size - lo_slice_bits;
- size_t lo_slice = (scalar.data[start_limb] >> lo_slice_offset) & ((static_cast<uint64_t>(1) << lo_slice_bits) - 1);
- size_t hi_slice = (scalar.data[end_limb] & ((static_cast<uint64_t>(1) << hi_slice_bits) - 1));
-
- uint32_t lo = static_cast<uint32_t>(lo_slice);
- uint32_t hi = static_cast<uint32_t>(hi_slice);
-
- uint32_t result = lo + (hi << lo_slice_bits);
- return result;
+ size_t lo_bit = (hi_bit < slice_size) ? 0 : hi_bit - slice_size;
+
+ BB_ASSERT_DEBUG(lo_bit < hi_bit);
+ BB_ASSERT_DEBUG(hi_bit <= NUM_BITS_IN_FIELD); // Ensures hi_bit < 256, so end_limb <= 3
+
+ size_t start_limb = lo_bit / LIMB_BITS;
+ size_t end_limb = hi_bit / LIMB_BITS;
+ size_t lo_slice_offset = lo_bit & (LIMB_BITS - 1);
+ size_t actual_slice_size = hi_bit - lo_bit;
+ size_t lo_slice_bits =
+ (LIMB_BITS - lo_slice_offset < actual_slice_size) ? (LIMB_BITS - lo_slice_offset) : actual_slice_size;
+ size_t hi_slice_bits = actual_slice_size - lo_slice_bits;
+
+ uint64_t lo_slice = (scalar.data[start_limb] >> lo_slice_offset) & ((1ULL << lo_slice_bits) - 1);
+ uint64_t hi_slice = (start_limb != end_limb) ? (scalar.data[end_limb] & ((1ULL << hi_slice_bits) - 1)) : 0;
+
+ return static_cast<uint32_t>(lo_slice | (hi_slice << lo_slice_bits));
}
-/**
- * @brief For a given number of points, compute the optimal Pippenger bucket size
- *
- * @tparam Curve
- * @param num_points
- * @return constexpr size_t
- */
-template <typename Curve> size_t MSM<Curve>::get_optimal_log_num_buckets(const size_t num_points) noexcept
+template <typename Curve> uint32_t MSM<Curve>::get_optimal_log_num_buckets(const size_t num_points) noexcept
{
- // We do 2 group operations per bucket, and they are full 3D Jacobian adds which are ~2x more than an affine add
- constexpr size_t COST_OF_BUCKET_OP_RELATIVE_TO_POINT = 5;
- size_t cached_cost = static_cast<size_t>(-1);
- size_t target_bit_slice = 0;
- for (size_t bit_slice = 1; bit_slice < 20; ++bit_slice) {
- const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bit_slice);
- const size_t num_buckets = 1 << bit_slice;
- const size_t addition_cost = num_rounds * num_points;
- const size_t bucket_cost = num_rounds * num_buckets * COST_OF_BUCKET_OP_RELATIVE_TO_POINT;
- const size_t total_cost = addition_cost + bucket_cost;
- if (total_cost < cached_cost) {
- cached_cost = total_cost;
- target_bit_slice = bit_slice;
+ // Cost model: total_cost = num_rounds * (num_points + num_buckets * BUCKET_ACCUMULATION_COST)
+ auto compute_cost = [&](uint32_t bits) {
+ size_t rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, static_cast<size_t>(bits));
+ size_t buckets = size_t{ 1 } << bits;
+ return rounds * (num_points + buckets * BUCKET_ACCUMULATION_COST);
+ };
+
+ uint32_t best_bits = 1;
+ size_t best_cost = compute_cost(1);
+ for (uint32_t bits = 2; bits < MAX_SLICE_BITS; ++bits) {
+ size_t cost = compute_cost(bits);
+ if (cost < best_cost) {
+ best_cost = cost;
+ best_bits = bits;
}
}
- return target_bit_slice;
+ return best_bits;
}
-/**
- * @brief Given a number of points and an optimal bucket size, should we use the affine trick?
- *
- * @tparam Curve
- * @param num_points
- * @param num_buckets
- * @return true
- * @return false
- */
template <typename Curve> bool MSM<Curve>::use_affine_trick(const size_t num_points, const size_t num_buckets) noexcept
{
- if (num_points < 128) {
+ if (num_points < AFFINE_TRICK_THRESHOLD) {
return false;
}
// Affine trick requires log(N) modular inversions per Pippenger round.
- // It saves NUM_POINTS * COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION field muls
- // It also saves NUM_BUCKETS * EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1 field muls
- // due to all our buckets having Z=1 if we use the affine trick
-
- // COST_OF_INVERSION cost:
- // Requires NUM_BITS_IN_FIELD sqarings
- // We use 4-bit windows = ((NUM_BITS_IN_FIELD + 3) / 4) multiplications
- // Computing 4-bit window table requires 14 muls
- constexpr size_t COST_OF_INVERSION = NUM_BITS_IN_FIELD + ((NUM_BITS_IN_FIELD + 3) / 4) + 14;
- constexpr size_t COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION = 5;
- constexpr size_t EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1 = 5;
-
- double num_points_f = static_cast<double>(num_points);
- double log2_num_points_f = log2(num_points_f);
-
- size_t group_op_cost_saving_per_round = (num_points * COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION) +
- (num_buckets * EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1);
- double inversion_cost_per_round = log2_num_points_f * static_cast<double>(COST_OF_INVERSION);
-
- return static_cast<double>(group_op_cost_saving_per_round) > inversion_cost_per_round;
+ // It saves num_points * AFFINE_TRICK_SAVINGS_PER_OP field muls, plus
+ // num_buckets * JACOBIAN_Z_NOT_ONE_PENALTY field muls (buckets have Z=1 with affine trick)
+
+ // Cost of modular inversion via exponentiation:
+ // - NUM_BITS_IN_FIELD squarings
+ // - (NUM_BITS_IN_FIELD + 3) / 4 multiplications (4-bit windows)
+ // - INVERSION_TABLE_COST multiplications for lookup table
+ constexpr size_t COST_OF_INVERSION = NUM_BITS_IN_FIELD + ((NUM_BITS_IN_FIELD + 3) / 4) + INVERSION_TABLE_COST;
+
+ double log2_num_points = log2(static_cast<double>(num_points));
+ size_t savings_per_round = (num_points * AFFINE_TRICK_SAVINGS_PER_OP) + (num_buckets * JACOBIAN_Z_NOT_ONE_PENALTY);
+ double inversion_cost_per_round = log2_num_points * static_cast<double>(COST_OF_INVERSION);
+
+ return static_cast<double>(savings_per_round) > inversion_cost_per_round;
}
-/**
- * @brief adds a bunch of points together using affine addition formulae.
- * @details Paradoxically, the affine formula is crazy efficient if you have a lot of independent point additions to
- * perform. Affine formula:
- *
- * \lambda = (y_2 - y_1) / (x_2 - x_1)
- * x_3 = \lambda^2 - (x_2 + x_1)
- * y_3 = \lambda*(x_1 - x_3) - y_1
- *
- * Traditionally, we avoid affine formulae like the plague, because computing lambda requires a modular inverse,
- * which is outrageously expensive.
- *
- * However! We can use Montgomery's batch inversion technique to amortise the cost of the inversion to ~0.
- *
- * The way batch inversion works is as follows. Let's say you want to compute \{ 1/x_1, 1/x_2, ..., 1/x_n \}
- * The trick is to compute the product x_1x_2...x_n , whilst storing all of the temporary products.
- * i.e. we have an array A = [x_1, x_1x_2, ..., x_1x_2...x_n]
- * We then compute a single inverse: I = 1 / x_1x_2...x_n
- * Finally, we can use our accumulated products, to quotient out individual inverses.
- * We can get an individual inverse at index i, by computing I.A_{i-1}.(x_nx_n-1...x_i+1)
- * The last product term we can compute on-the-fly, as it grows by one element for each additional inverse that we
- * require.
- *
- * TLDR: amortized cost of a modular inverse is 3 field multiplications per inverse.
- * Which means we can compute a point addition with SIX field multiplications in total.
- * The traditional Jacobian-coordinate formula requires 11.
- *
- * There is a catch though - we need large sequences of independent point additions!
- * i.e. the output from one point addition in the sequence is NOT an input to any other point addition in the
- * sequence.
- *
- * We can re-arrange the Pippenger algorithm to get this property, but it's...complicated
- * @tparam Curve
- * @param points points to be added pairwise; result is stored in the latter half of the array
- * @param num_points
- * @param scratch_space coordinate field scratch space needed for batched inversion
- **/
template <typename Curve>
void MSM::add_affine_points(typename Curve::AffineElement* points,
const size_t num_points,
@@ -328,12 +243,14 @@ void MSM::add_affine_points(typename Curve::AffineElement* points,
using Fq = typename Curve::BaseField;
Fq batch_inversion_accumulator = Fq::one();
+ // Forward pass: prepare batch inversion inputs.
+ // We reuse points[i+1] storage: .x stores (x2-x1), .y stores (y2-y1)*accumulator
for (size_t i = 0; i < num_points; i += 2) {
- scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1
- points[i + 1].x -= points[i].x; // x2 - x1
- points[i + 1].y -= points[i].y; // y2 - y1
+ scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 (needed later for x3)
+ points[i + 1].x -= points[i].x; // x2 - x1 (denominator for lambda)
+ points[i + 1].y -= points[i].y; // y2 - y1 (numerator for lambda)
points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old
- batch_inversion_accumulator *= (points[i + 1].x);
+ batch_inversion_accumulator *= (points[i + 1].x); // accumulate denominators
}
if (batch_inversion_accumulator == 0) {
// prefer abort to throw for code that might emit from multiple threads
@@ -342,421 +259,215 @@ void MSM::add_affine_points(typename Curve::AffineElement* points,
batch_inversion_accumulator = batch_inversion_accumulator.invert();
}
- // Iterate backwards through the points, comnputing pairwise affine additions; addition results are stored in the
- // latter half of the array
+ // Backward pass: compute additions using batch inversion results.
+ // Reusing points[i+1] storage: .y becomes lambda, .x becomes lambda^2.
+ // Results are written to the top half of points array: points[(i+num_points)/2].
+ // Loop terminates when i underflows: unsigned wrap-around makes i larger than num_points.
for (size_t i = (num_points)-2; i < num_points; i -= 2) {
- points[i + 1].y *= batch_inversion_accumulator; // update accumulator
- batch_inversion_accumulator *= points[i + 1].x;
- points[i + 1].x = points[i + 1].y.sqr();
- points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda_squared - x2
- // - x1
- // Memory bandwidth is a bit of a bottleneck here.
- // There's probably a more elegant way of structuring our data so we don't need to do all of this
- // prefetching
+ points[i + 1].y *= batch_inversion_accumulator; // .y now holds lambda = (y2-y1)/(x2-x1)
+ batch_inversion_accumulator *= points[i + 1].x; // restore accumulator for next iteration
+ points[i + 1].x = points[i + 1].y.sqr(); // .x now holds lambda^2
+ points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda^2 - x2 - x1
+ // Output addresses jump non-sequentially: points[(i+n)>>1] defeats hardware prefetcher.
+ // Fetching 2 iterations ahead ensures data arrives before needed.
if (i >= 2) {
__builtin_prefetch(points + i - 2);
__builtin_prefetch(points + i - 1);
__builtin_prefetch(points + ((i + num_points - 2) >> 1));
__builtin_prefetch(scratch_space + ((i - 2) >> 1));
}
- points[i].x -= points[(i + num_points) >> 1].x;
- points[i].x *= points[i + 1].y;
- points[(i + num_points) >> 1].y = points[i].x - points[i].y;
+ // Compute y3 = lambda * (x1 - x3) - y1, reusing points[i].x as temp storage
+ points[i].x -= points[(i + num_points) >> 1].x; // x1 - x3
+ points[i].x *= points[i + 1].y; // lambda * (x1 - x3)
+ points[(i + num_points) >> 1].y = points[i].x - points[i].y; // y3 = lambda*(x1-x3) - y1
}
}
-/**
- * @brief Top-level Pippenger algorithm where number of points is small and we are not using the Affine trick
- *
- * @tparam Curve
- * @param msm_data
- * @return Curve::AffineElement
- */
template <typename Curve>
-typename Curve::Element MSM::small_pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept
+typename Curve::Element MSM::jacobian_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept
{
- std::span& nonzero_scalar_indices = msm_data.scalar_indices;
- const size_t size = nonzero_scalar_indices.size();
- const size_t bits_per_slice = get_optimal_log_num_buckets(size);
- const size_t num_buckets = 1 << bits_per_slice;
- JacobianBucketAccumulators bucket_data = JacobianBucketAccumulators(num_buckets);
- Element round_output = Curve::Group::point_at_infinity;
+ const size_t size = msm_data.scalar_indices.size();
+ const uint32_t bits_per_slice = get_optimal_log_num_buckets(size);
+ const size_t num_buckets = size_t{ 1 } << bits_per_slice;
+ const uint32_t num_rounds = static_cast<uint32_t>((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice);
+ const uint32_t remainder = NUM_BITS_IN_FIELD % bits_per_slice;
+
+ JacobianBucketAccumulators bucket_data(num_buckets);
+ Element msm_result = Curve::Group::point_at_infinity;
+
+ for (uint32_t round = 0; round < num_rounds; ++round) {
+ // Populate buckets using Jacobian accumulation
+ for (size_t i = 0; i < size; ++i) {
+ uint32_t idx = msm_data.scalar_indices[i];
+ uint32_t bucket = get_scalar_slice(msm_data.scalars[idx], round, bits_per_slice);
+ if (bucket > 0) {
+ if (bucket_data.bucket_exists.get(bucket)) {
+ bucket_data.buckets[bucket] += msm_data.points[idx];
+ } else {
+ bucket_data.buckets[bucket] = msm_data.points[idx];
+ bucket_data.bucket_exists.set(bucket, true);
+ }
+ }
+ }
- const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
+ // Reduce buckets and accumulate into result
+ Element bucket_result = accumulate_buckets(bucket_data);
+ bucket_data.bucket_exists.clear();
- for (size_t i = 0; i < num_rounds; ++i) {
- round_output = evaluate_small_pippenger_round(msm_data, i, bucket_data, round_output, bits_per_slice);
+ uint32_t num_doublings = (round == num_rounds - 1 && remainder != 0) ? remainder : bits_per_slice;
+ for (uint32_t i = 0; i < num_doublings; ++i) {
+ msm_result.self_dbl();
+ }
+ msm_result += bucket_result;
}
- return round_output;
+ return msm_result;
}
-/**
- * @brief Top-level Pippenger algorithm
- *
- * @tparam Curve
- * @param msm_data
- * @return Curve::AffineElement
- */
template <typename Curve>
-typename Curve::Element MSM::pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept
+typename Curve::Element MSM::affine_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept
{
- const size_t msm_size = msm_data.scalar_indices.size();
- const size_t bits_per_slice = get_optimal_log_num_buckets(msm_size);
- const size_t num_buckets = 1 << bits_per_slice;
+ const size_t num_points = msm_data.scalar_indices.size();
+ const uint32_t bits_per_slice = get_optimal_log_num_buckets(num_points);
+ const size_t num_buckets = size_t{ 1 } << bits_per_slice;
- if (!use_affine_trick(msm_size, num_buckets)) {
- return small_pippenger_low_memory_with_transformed_scalars(msm_data);
+ if (!use_affine_trick(num_points, num_buckets)) {
+ return jacobian_pippenger_with_transformed_scalars(msm_data);
}
- // TODO(https://github.com/AztecProtocol/barretenberg/issues/1452): Consider allowing this memory to persist rather
- // than allocating/deallocating on every execution.
- AffineAdditionData affine_data = AffineAdditionData();
- BucketAccumulators bucket_data = BucketAccumulators(num_buckets);
- Element round_output = Curve::Group::point_at_infinity;
+ const uint32_t num_rounds = static_cast<uint32_t>((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice);
+ const uint32_t remainder = NUM_BITS_IN_FIELD % bits_per_slice;
- const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
- for (size_t i = 0; i < num_rounds; ++i) {
- round_output = evaluate_pippenger_round(msm_data, i, affine_data, bucket_data, round_output, bits_per_slice);
- }
+ // Per-call allocation for WASM compatibility (thread_local causes issues in WASM)
+ AffineAdditionData affine_data;
+ BucketAccumulators bucket_data(num_buckets);
- return (round_output);
-}
+ Element msm_result = Curve::Group::point_at_infinity;
-/**
- * @brief Evaluate a single Pippenger round when we do not use the Affine trick
- *
- * @tparam Curve
- * @param msm_data
- * @param round_index
- * @param bucket_data
- * @param previous_round_output
- * @param bits_per_slice
- * @return Curve::Element
- */
-template
-typename Curve::Element MSM::evaluate_small_pippenger_round(MSMData& msm_data,
- const size_t round_index,
- MSM::JacobianBucketAccumulators& bucket_data,
- typename Curve::Element previous_round_output,
- const size_t bits_per_slice) noexcept
-{
- std::span& nonzero_scalar_indices = msm_data.scalar_indices;
- std::span& scalars = msm_data.scalars;
- std::span& points = msm_data.points;
-
- const size_t size = nonzero_scalar_indices.size();
- for (size_t i = 0; i < size; ++i) {
- BB_ASSERT_DEBUG(nonzero_scalar_indices[i] < scalars.size());
- uint32_t bucket_index = get_scalar_slice(scalars[nonzero_scalar_indices[i]], round_index, bits_per_slice);
- BB_ASSERT_DEBUG(bucket_index < static_cast(1 << bits_per_slice));
- if (bucket_index > 0) {
- // do this check because we do not reset bucket_data.buckets after each round
- // (i.e. not neccessarily at infinity)
- if (bucket_data.bucket_exists.get(bucket_index)) {
- bucket_data.buckets[bucket_index] += points[nonzero_scalar_indices[i]];
- } else {
- bucket_data.buckets[bucket_index] = points[nonzero_scalar_indices[i]];
- bucket_data.bucket_exists.set(bucket_index, true);
+ for (uint32_t round = 0; round < num_rounds; ++round) {
+ // Build point schedule for this round
+ {
+ for (size_t i = 0; i < num_points; ++i) {
+ uint32_t idx = msm_data.scalar_indices[i];
+ uint32_t bucket_idx = get_scalar_slice(msm_data.scalars[idx], round, bits_per_slice);
+ msm_data.point_schedule[i] = PointScheduleEntry::create(idx, bucket_idx).data;
}
}
- }
- Element round_output;
- round_output.self_set_infinity();
- round_output = accumulate_buckets(bucket_data);
- bucket_data.bucket_exists.clear();
- Element result = previous_round_output;
- const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
- size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0))
- ? NUM_BITS_IN_FIELD % bits_per_slice
- : bits_per_slice;
- for (size_t i = 0; i < num_doublings; ++i) {
- result.self_dbl();
- }
- result += round_output;
- return result;
-}
-
-/**
- * @brief Evaluate a single Pippenger round where we use the affine trick
- *
- * @tparam Curve
- * @param msm_data
- * @param round_index
- * @param affine_data
- * @param bucket_data
- * @param previous_round_output
- * @param bits_per_slice
- * @return Curve::Element
- */
-template
-typename Curve::Element MSM::evaluate_pippenger_round(MSMData& msm_data,
- const size_t round_index,
- MSM::AffineAdditionData& affine_data,
- MSM::BucketAccumulators& bucket_data,
- typename Curve::Element previous_round_output,
- const size_t bits_per_slice) noexcept
-{
- std::span& scalar_indices = msm_data.scalar_indices; // indices of nonzero scalars
- std::span& scalars = msm_data.scalars;
- std::span& points = msm_data.points;
- std::span& round_schedule = msm_data.point_schedule;
- const size_t size = scalar_indices.size();
-
- // Construct a "round schedule". Each entry describes:
- // 1. low 32 bits: which bucket index do we add the point into? (bucket index = slice value)
- // 2. high 32 bits: which point index do we source the point from?
- for (size_t i = 0; i < size; ++i) {
- BB_ASSERT_DEBUG(scalar_indices[i] < scalars.size());
- round_schedule[i] = get_scalar_slice(scalars[scalar_indices[i]], round_index, bits_per_slice);
- round_schedule[i] += (static_cast(scalar_indices[i]) << 32ULL);
- }
- // Sort our point schedules based on their bucket values. Reduces memory throughput in next step of algo
- const size_t num_zero_entries = scalar_multiplication::process_buckets_count_zero_entries(
- &round_schedule[0], size, static_cast(bits_per_slice));
- BB_ASSERT_DEBUG(num_zero_entries <= size);
- const size_t round_size = size - num_zero_entries;
-
- Element round_output;
- round_output.self_set_infinity();
-
- if (round_size > 0) {
- std::span point_schedule(&round_schedule[num_zero_entries], round_size);
- // Iterate through our point schedule and add points into corresponding buckets
- consume_point_schedule(point_schedule, points, affine_data, bucket_data, 0, 0);
- round_output = accumulate_buckets(bucket_data);
- bucket_data.bucket_exists.clear();
- }
+ // Sort by bucket and count zero-bucket entries
+ size_t num_zero_bucket_entries =
+ sort_point_schedule_and_count_zero_buckets(&msm_data.point_schedule[0], num_points, bits_per_slice);
+ size_t round_size = num_points - num_zero_bucket_entries;
+
+ // Accumulate points into buckets
+ Element bucket_result = Curve::Group::point_at_infinity;
+ if (round_size > 0) {
+ std::span schedule(&msm_data.point_schedule[num_zero_bucket_entries], round_size);
+ batch_accumulate_points_into_buckets(schedule, msm_data.points, affine_data, bucket_data);
+ bucket_result = accumulate_buckets(bucket_data);
+ bucket_data.bucket_exists.clear();
+ }
- Element result = previous_round_output;
- const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
- size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0))
- ? NUM_BITS_IN_FIELD % bits_per_slice
- : bits_per_slice;
- for (size_t i = 0; i < num_doublings; ++i) {
- result.self_dbl();
+ // Combine into running result
+ uint32_t num_doublings = (round == num_rounds - 1 && remainder != 0) ? remainder : bits_per_slice;
+ for (uint32_t i = 0; i < num_doublings; ++i) {
+ msm_result.self_dbl();
+ }
+ msm_result += bucket_result;
}
- result += round_output;
- return result;
+ return msm_result;
}
-/**
- * @brief Given a list of points and target buckets to add into, perform required group operations
- * @details This algorithm uses exclusively affine group operations, using batch inversions to amortise costs
- *
- * @tparam Curve
- * @param point_schedule
- * @param points
- * @param affine_data
- * @param bucket_data
- * @param num_input_points_processed
- * @param num_queued_affine_points
- */
template <typename Curve>
-void MSM::consume_point_schedule(std::span point_schedule,
- std::span points,
- MSM::AffineAdditionData& affine_data,
- MSM::BucketAccumulators& bucket_data,
- size_t num_input_points_processed,
- size_t num_queued_affine_points) noexcept
+void MSM::batch_accumulate_points_into_buckets(std::span point_schedule,
+ std::span points,
+ MSM::AffineAdditionData& affine_data,
+ MSM::BucketAccumulators& bucket_data) noexcept
{
- size_t point_it = num_input_points_processed;
- size_t affine_input_it = num_queued_affine_points;
- // N.B. points and point_schedule MAY HAVE DIFFERENT SIZES
- // We source the number of actual points we work on from the point schedule
- size_t num_points = point_schedule.size();
- auto& bucket_accumulator_exists = bucket_data.bucket_exists;
- auto& affine_addition_scratch_space = affine_data.points_to_add;
- auto& bucket_accumulators = bucket_data.buckets;
- auto& affine_addition_output_bucket_destinations = affine_data.addition_result_bucket_destinations;
- auto& scalar_scratch_space = affine_data.scalar_scratch_space;
- auto& output_point_schedule = affine_data.addition_result_bucket_destinations;
- AffineElement null_location{};
- // We do memory prefetching, `prefetch_max` ensures we do not overflow our containers
- size_t prefetch_max = (num_points - 32);
- if (num_points < 32) {
- prefetch_max = 0;
- }
- size_t end = num_points - 1;
- if (num_points == 0) {
- end = 0;
- }
+ if (point_schedule.empty()) {
+ return;
+ }
+
+ size_t point_it = 0;
+ size_t scratch_it = 0;
+ const size_t num_points = point_schedule.size();
+ const size_t prefetch_max = (num_points >= PREFETCH_LOOKAHEAD) ? (num_points - PREFETCH_LOOKAHEAD) : 0;
+ const size_t last_index = num_points - 1;
+
+ // Iterative loop - continues until all points processed and no work remains in scratch space
+ while (point_it < num_points || scratch_it != 0) {
+ // Step 1: Fill scratch space with up to BATCH_SIZE/2 independent additions
+ while (((scratch_it + 1) < AffineAdditionData::BATCH_SIZE) && (point_it < last_index)) {
+ // Prefetch points we'll need soon (every PREFETCH_INTERVAL iterations)
+ if ((point_it < prefetch_max) && ((point_it & PREFETCH_INTERVAL_MASK) == 0)) {
+ for (size_t i = PREFETCH_LOOKAHEAD / 2; i < PREFETCH_LOOKAHEAD; ++i) {
+ PointScheduleEntry entry{ point_schedule[point_it + i] };
+ __builtin_prefetch(&points[entry.point_index()]);
+ }
+ }
+
+ PointScheduleEntry lhs{ point_schedule[point_it] };
+ PointScheduleEntry rhs{ point_schedule[point_it + 1] };
+
+ process_bucket_pair(lhs.bucket_index(),
+ rhs.bucket_index(),
+ &points[lhs.point_index()],
+ &points[rhs.point_index()],
+ affine_data,
+ bucket_data,
+ scratch_it,
+ point_it);
+ }
- // Step 1: Fill up `affine_addition_scratch_space` with up to AffineAdditionData::BATCH_SIZE/2 independent additions
- while (((affine_input_it + 1) < AffineAdditionData::BATCH_SIZE) && (point_it < end)) {
+ // Handle the last point (odd count case) - separate to avoid bounds check on point_schedule[point_it + 1]
+ if (point_it == last_index) {
+ PointScheduleEntry last{ point_schedule[point_it] };
+ process_single_point(
+ last.bucket_index(), &points[last.point_index()], affine_data, bucket_data, scratch_it, point_it);
+ }
- // we prefetchin'
- if ((point_it < prefetch_max) && ((point_it & 0x0f) == 0)) {
- for (size_t i = 16; i < 32; ++i) {
- __builtin_prefetch(&points[(point_schedule[point_it + i] >> 32ULL)]);
- }
+ // Compute independent additions using Montgomery's batch inversion trick
+ size_t num_points_to_add = scratch_it;
+ if (num_points_to_add >= 2) {
+ add_affine_points(
+ affine_data.points_to_add.data(), num_points_to_add, affine_data.inversion_scratch_space.data());
}
- // We do some branchless programming here to minimize instruction pipeline flushes
- // TODO(@zac-williamson, cc @ludamad) check these ternary operators are not branching! -> (ludamad: they don't,
- // but its not clear that the conditional move is fundamentally less expensive)
- // We are iterating through our points and
- // can come across the following scenarios: 1: The next 2 points in `point_schedule` belong to the *same* bucket
- // (happy path - can put both points into affine_addition_scratch_space)
- // 2: The next 2 points have different bucket destinations AND point_schedule[point_it].bucket contains a point
- // (happyish path - we can put points[lhs_schedule] and buckets[lhs_bucket] into
- // affine_addition_scratch_space)
- // 3: The next 2 points have different bucket destionations AND point_schedule[point_it].bucket is empty
- // We cache points[lhs_schedule] into buckets[lhs_bucket]
- // We iterate `point_it` by 2 (case 1), or by 1 (case 2 or 3). The number of points we add into
- // `affine_addition_scratch_space` is 2 (case 1 or 2) or 0 (case 3).
- uint64_t lhs_schedule = point_schedule[point_it];
- uint64_t rhs_schedule = point_schedule[point_it + 1];
- size_t lhs_bucket = static_cast(lhs_schedule) & 0xFFFFFFFF;
- size_t rhs_bucket = static_cast(rhs_schedule) & 0xFFFFFFFF;
- size_t lhs_point = static_cast(lhs_schedule >> 32);
- size_t rhs_point = static_cast(rhs_schedule >> 32);
-
- bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
- bool buckets_match = lhs_bucket == rhs_bucket;
- bool do_affine_add = buckets_match || has_bucket_accumulator;
-
- const AffineElement* lhs_source = &points[lhs_point];
- const AffineElement* rhs_source = buckets_match ? &points[rhs_point] : &bucket_accumulators[lhs_bucket];
-
- // either two points are set to be added (point to point or point into bucket accumulator), or lhs is stored in
- // the bucket and rhs is temporarily ignored
- AffineElement* lhs_destination =
- do_affine_add ? &affine_addition_scratch_space[affine_input_it] : &bucket_accumulators[lhs_bucket];
- AffineElement* rhs_destination =
- do_affine_add ? &affine_addition_scratch_space[affine_input_it + 1] : &null_location;
-
- // if performing an affine add, set the destination bucket corresponding to the addition result
- uint64_t& source_bucket_destination = affine_addition_output_bucket_destinations[affine_input_it >> 1];
- source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination;
-
- // unconditional swap. No if statements here.
- *lhs_destination = *lhs_source;
- *rhs_destination = *rhs_source;
-
- // indicate whether bucket_accumulators[lhs_bucket] will contain a point after this iteration
- bucket_accumulator_exists.set(
- lhs_bucket,
- (has_bucket_accumulator && buckets_match) || /* bucket has an accum and its not being used in current add */
- !do_affine_add); /* lhs point is cached into the bucket */
-
- affine_input_it += do_affine_add ? 2 : 0;
- point_it += (do_affine_add && buckets_match) ? 2 : 1;
- }
- // We have to handle the last point as an edge case so that we dont overflow the bounds of `point_schedule`. If the
- // bucket accumulator exists, we add the point to it, otherwise the point simply becomes the bucket accumulator.
- if (point_it == num_points - 1) {
- uint64_t lhs_schedule = point_schedule[point_it];
- size_t lhs_bucket = static_cast(lhs_schedule) & 0xFFFFFFFF;
- size_t lhs_point = static_cast(lhs_schedule >> 32);
- bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
-
- if (has_bucket_accumulator) { // point is added to its bucket accumulator
- affine_addition_scratch_space[affine_input_it] = points[lhs_point];
- affine_addition_scratch_space[affine_input_it + 1] = bucket_accumulators[lhs_bucket];
- bucket_accumulator_exists.set(lhs_bucket, false);
- affine_addition_output_bucket_destinations[affine_input_it >> 1] = lhs_bucket;
- affine_input_it += 2;
- point_it += 1;
- } else { // otherwise, cache the point into the bucket
- BB_ASSERT_DEBUG(lhs_point < points.size());
- bucket_accumulators[lhs_bucket] = points[lhs_point];
- bucket_accumulator_exists.set(lhs_bucket, true);
- point_it += 1;
+ // add_affine_points stores results in the top-half of scratch space
+ AffineElement* affine_output = affine_data.points_to_add.data() + (num_points_to_add / 2);
+
+ // Recirculate addition outputs back into scratch space or bucket accumulators
+ size_t new_scratch_it = 0;
+ size_t output_it = 0;
+ size_t num_outputs = num_points_to_add / 2;
+
+ while ((num_outputs > 1) && (output_it + 1 < num_outputs)) {
+ uint32_t lhs_bucket = affine_data.addition_result_bucket_destinations[output_it];
+ uint32_t rhs_bucket = affine_data.addition_result_bucket_destinations[output_it + 1];
+
+ process_bucket_pair(lhs_bucket,
+ rhs_bucket,
+ &affine_output[output_it],
+ &affine_output[output_it + 1],
+ affine_data,
+ bucket_data,
+ new_scratch_it,
+ output_it);
}
- }
- // Now that we have populated `affine_addition_scratch_space`,
- // compute `num_affine_points_to_add` independent additions using the Affine trick
- size_t num_affine_points_to_add = affine_input_it;
- if (num_affine_points_to_add >= 2) {
- add_affine_points(&affine_addition_scratch_space[0], num_affine_points_to_add, &scalar_scratch_space[0]);
- }
- // `add_affine_points` stores the result in the top-half of the used scratch space
- G1* affine_output = &affine_addition_scratch_space[0] + (num_affine_points_to_add / 2);
-
- // Process the addition outputs.
- // We either need to feed the addition outputs back into affine_addition_scratch_space for more addition operations.
- // Or, if there are no more additions for a bucket, we store the addition output in a bucket accumulator.
- size_t new_scratch_space_it = 0;
- size_t affine_output_it = 0;
- size_t num_affine_output_points = num_affine_points_to_add / 2;
- // This algorithm is equivalent to the one we used to populate `affine_addition_scratch_space` from the point
- // schedule, however here we source points from a different location (the addition results)
- while ((affine_output_it < (num_affine_output_points - 1)) && (num_affine_output_points > 0)) {
- size_t lhs_bucket = static_cast(affine_addition_output_bucket_destinations[affine_output_it]);
- size_t rhs_bucket = static_cast(affine_addition_output_bucket_destinations[affine_output_it + 1]);
- BB_ASSERT_DEBUG(lhs_bucket < bucket_accumulator_exists.size());
-
- bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
- bool buckets_match = (lhs_bucket == rhs_bucket);
- bool do_affine_add = buckets_match || has_bucket_accumulator;
-
- const AffineElement* lhs_source = &affine_output[affine_output_it];
- const AffineElement* rhs_source =
- buckets_match ? &affine_output[affine_output_it + 1] : &bucket_accumulators[lhs_bucket];
-
- AffineElement* lhs_destination =
- do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it] : &bucket_accumulators[lhs_bucket];
- AffineElement* rhs_destination =
- do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it + 1] : &null_location;
-
- uint64_t& source_bucket_destination = output_point_schedule[new_scratch_space_it >> 1];
- source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination;
-
- *lhs_destination = *lhs_source;
- *rhs_destination = *rhs_source;
-
- bucket_accumulator_exists.set(lhs_bucket, (has_bucket_accumulator && buckets_match) || !do_affine_add);
- new_scratch_space_it += do_affine_add ? 2 : 0;
- affine_output_it += (do_affine_add && buckets_match) ? 2 : 1;
- }
- // perform final iteration as edge case so we don't overflow `affine_addition_output_bucket_destinations`
- if (affine_output_it == (num_affine_output_points - 1)) {
-
- size_t lhs_bucket = static_cast(affine_addition_output_bucket_destinations[affine_output_it]);
-
- bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
- if (has_bucket_accumulator) {
- BB_ASSERT_DEBUG(new_scratch_space_it + 1 < affine_addition_scratch_space.size());
- BB_ASSERT_DEBUG(lhs_bucket < bucket_accumulators.size());
- BB_ASSERT_DEBUG((new_scratch_space_it >> 1) < output_point_schedule.size());
- affine_addition_scratch_space[new_scratch_space_it] = affine_output[affine_output_it];
- affine_addition_scratch_space[new_scratch_space_it + 1] = bucket_accumulators[lhs_bucket];
- bucket_accumulator_exists.set(lhs_bucket, false);
- output_point_schedule[new_scratch_space_it >> 1] = lhs_bucket;
- new_scratch_space_it += 2;
- affine_output_it += 1;
- } else {
- bucket_accumulators[lhs_bucket] = affine_output[affine_output_it];
- bucket_accumulator_exists.set(lhs_bucket, true);
- affine_output_it += 1;
+ // Handle the last output (odd count case)
+ if (num_outputs > 0 && output_it == num_outputs - 1) {
+ uint32_t bucket = affine_data.addition_result_bucket_destinations[output_it];
+ process_single_point(
+ bucket, &affine_output[output_it], affine_data, bucket_data, new_scratch_it, output_it);
}
- }
- // If we have not finished iterating over the point schedule,
- // OR we have affine additions to perform in the scratch space, continue
- if (point_it < num_points || new_scratch_space_it != 0) {
- consume_point_schedule(point_schedule, points, affine_data, bucket_data, point_it, new_scratch_space_it);
+ // Continue with recirculated points
+ scratch_it = new_scratch_it;
}
}
-/**
- * @brief Compute multiple multi-scalar multiplications.
- * @details If we need to perform multiple MSMs, this method will be more efficient than calling `msm` repeatedly
- * This is because this method will be able to dispatch equal work to all threads without splitting the input
- * msms up so much.
- * The Pippenger algorithm runtime is O(N/log(N)) so there will be slight gains as each inner-thread MSM will
- * have a larger N.
- * The input scalars are not const because the algorithm converts them out of Montgomery form and then back.
- *
- * @tparam Curve
- * @param points
- * @param scalars
- * @return std::vector
- */
template <typename Curve>
std::vector MSM::batch_multi_scalar_mul(
std::span> points,
@@ -772,32 +483,29 @@ std::vector MSM::batch_multi_scalar_mul(
std::vector>> thread_msm_results(num_cpus);
BB_ASSERT_EQ(thread_work_units.size(), num_cpus);
+ // Select Pippenger implementation once (hoisting branch outside hot loop)
+ // Jacobian: safe, handles edge cases | Affine: faster, assumes linearly independent points
+ auto pippenger_impl =
+ handle_edge_cases ? jacobian_pippenger_with_transformed_scalars : affine_pippenger_with_transformed_scalars;
+
// Once we have our work units, each thread can independently evaluate its assigned msms
parallel_for(num_cpus, [&](size_t thread_idx) {
if (!thread_work_units[thread_idx].empty()) {
const std::vector& msms = thread_work_units[thread_idx];
std::vector>& msm_results = thread_msm_results[thread_idx];
+ msm_results.reserve(msms.size());
+
+ // Point schedule buffer for this thread - avoids per-work-unit heap allocation
+ std::vector point_schedule_buffer;
+
for (const MSMWorkUnit& msm : msms) {
- std::span work_scalars = scalars[msm.batch_msm_index];
- std::span work_points = points[msm.batch_msm_index];
- std::span work_indices =
- std::span{ &msm_scalar_indices[msm.batch_msm_index][msm.start_index], msm.size };
- std::vector point_schedule(msm.size);
- MSMData msm_data(work_scalars, work_points, work_indices, std::span(point_schedule));
- Element msm_result = Curve::Group::point_at_infinity;
- constexpr size_t SINGLE_MUL_THRESHOLD = 16;
- if (msm.size < SINGLE_MUL_THRESHOLD) {
- msm_result = small_mul(work_scalars, work_points, msm_data.scalar_indices, msm.size);
- } else {
- // Our non-affine method implicitly handles cases where Weierstrass edge cases may occur
- // Note: not as fast! use unsafe version if you know all input base points are linearly independent
- if (handle_edge_cases) {
- msm_result = small_pippenger_low_memory_with_transformed_scalars(msm_data);
- } else {
- msm_result = pippenger_low_memory_with_transformed_scalars(msm_data);
- }
- }
- msm_results.push_back(std::make_pair(msm_result, msm.batch_msm_index));
+ point_schedule_buffer.resize(msm.size);
+ MSMData msm_data =
+ MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm);
+ Element msm_result =
+ (msm.size < PIPPENGER_THRESHOLD) ? small_mul(msm_data) : pippenger_impl(msm_data);
+
+ msm_results.emplace_back(msm_result, msm.batch_msm_index);
}
}
});
@@ -805,61 +513,49 @@ std::vector MSM::batch_multi_scalar_mul(
// Accumulate results. This part needs to be single threaded, but amount of work done here should be small
// TODO(@zac-williamson) check this? E.g. if we are doing a 2^16 MSM with 256 threads this single-threaded part
// will be painful.
- std::vector results(num_msms);
- for (Element& ele : results) {
- ele.self_set_infinity();
- }
+ std::vector results(num_msms, Curve::Group::point_at_infinity);
for (const auto& single_thread_msm_results : thread_msm_results) {
- for (const std::pair& result : single_thread_msm_results) {
- results[result.second] += result.first;
+ for (const auto& [element, index] : single_thread_msm_results) {
+ results[index] += element;
}
}
- Element::batch_normalize(&results[0], num_msms);
+ Element::batch_normalize(results.data(), num_msms);
- std::vector affine_results;
- for (const auto& ele : results) {
- affine_results.emplace_back(AffineElement(ele.x, ele.y));
- }
-
- // Convert our scalars back into Montgomery form so they remain unchanged
- for (auto& msm_scalars : scalars) {
- parallel_for_range(msm_scalars.size(), [&](size_t start, size_t end) {
+ // Convert scalars back TO Montgomery form so they remain unchanged from caller's perspective
+ for (auto& scalar_span : scalars) {
+ parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
- msm_scalars[i].self_to_montgomery_form();
+ scalar_span[i].self_to_montgomery_form();
}
});
}
- return affine_results;
+
+ return std::vector(results.begin(), results.end());
}
-/**
- * @brief Helper method to evaluate a single MSM. Internally calls `batch_multi_scalar_mul`
- *
- * @tparam Curve
- * @param points
- * @param _scalars
- * @return Curve::AffineElement
- */
template <typename Curve>
typename Curve::AffineElement MSM::msm(std::span points,
- PolynomialSpan _scalars,
+ PolynomialSpan scalars,
bool handle_edge_cases) noexcept
{
- if (_scalars.size() == 0) {
+ if (scalars.size() == 0) {
return Curve::Group::affine_point_at_infinity;
}
- BB_ASSERT_GTE(points.size(), _scalars.start_index + _scalars.size());
-
- // unfortnately we need to remove const on this data type to prevent duplicating _scalars (which is typically
- // large) We need to convert `_scalars` out of montgomery form for the MSM. We then convert the scalars back
- // into Montgomery form at the end of the algorithm. NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
- // TODO(https://github.com/AztecProtocol/barretenberg/issues/1449): handle const correctness.
- ScalarField* scalars = const_cast(&_scalars[_scalars.start_index]);
-
- std::vector> pp{ points.subspan(_scalars.start_index) };
- std::vector> ss{ std::span(scalars, _scalars.size()) };
- AffineElement result = batch_multi_scalar_mul(pp, ss, handle_edge_cases)[0];
- return result;
+ const size_t num_scalars = scalars.size();
+ BB_ASSERT_GTE(points.size(), scalars.start_index + num_scalars);
+
+ // const_cast is safe: we convert from Montgomery, compute, then convert back.
+ // Scalars are unchanged from the caller's perspective.
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+ ScalarField* scalar_ptr = const_cast(&scalars[scalars.start_index]);
+ std::span scalar_span(scalar_ptr, num_scalars);
+
+ // Wrap into a size-1 batch and delegate to the general method that properly handles multi-threading
+ std::array, 1> points_batch{ points.subspan(scalars.start_index) };
+ std::array, 1> scalars_batch{ scalar_span };
+
+ auto results = batch_multi_scalar_mul(std::span(points_batch), std::span(scalars_batch), handle_edge_cases);
+ return results[0];
}
template
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp
index 448c61b99421..0f410bfe1a63 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp
@@ -12,10 +12,8 @@
#include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp"
#include "barretenberg/polynomials/polynomial.hpp"
-#include "./process_buckets.hpp"
-#include "./scalar_multiplication.hpp"
-
#include "./bitvector.hpp"
+#include "./process_buckets.hpp"
namespace bb::scalar_multiplication {
template class MSM {
@@ -25,15 +23,70 @@ template class MSM {
using BaseField = typename Curve::BaseField;
using AffineElement = typename Curve::AffineElement;
- using G1 = AffineElement;
static constexpr size_t NUM_BITS_IN_FIELD = ScalarField::modulus.get_msb() + 1;
+ // ======================= Algorithm Tuning Constants =======================
+ //
+ // These constants control the behavior of the Pippenger MSM algorithm.
+ // They are empirically tuned for performance on typical hardware.
+
+ // Below this threshold, use naive scalar multiplication instead of Pippenger
+ static constexpr size_t PIPPENGER_THRESHOLD = 16;
+
+ // Below this threshold, the affine batch inversion trick is not beneficial
+ // (cost of inversions exceeds savings from cheaper affine additions)
+ static constexpr size_t AFFINE_TRICK_THRESHOLD = 128;
+
+ // Maximum bits per scalar slice (2^20 = 1M buckets, far beyond practical use)
+ static constexpr size_t MAX_SLICE_BITS = 20;
+
+ // Number of points to look ahead for memory prefetching
+ static constexpr size_t PREFETCH_LOOKAHEAD = 32;
+
+ // Prefetch every N iterations (must be power of 2); mask is N-1 for efficient modulo
+ static constexpr size_t PREFETCH_INTERVAL = 16;
+ static constexpr size_t PREFETCH_INTERVAL_MASK = PREFETCH_INTERVAL - 1;
+
+ // ======================= Cost Model Constants =======================
+ //
+ // These constants define the relative costs of various operations,
+ // used to decide between algorithm variants.
+
+ // Cost of bucket accumulation relative to a single point addition
+ // (2 Jacobian adds per bucket, each ~2.5x cost of affine add)
+ static constexpr size_t BUCKET_ACCUMULATION_COST = 5;
+
+ // Field multiplications saved per group operation when using affine trick
+ static constexpr size_t AFFINE_TRICK_SAVINGS_PER_OP = 5;
+
+ // Extra cost of Jacobian group operation when Z coordinate != 1
+ static constexpr size_t JACOBIAN_Z_NOT_ONE_PENALTY = 5;
+
+ // Cost of computing 4-bit lookup table for modular exponentiation (14 muls)
+ static constexpr size_t INVERSION_TABLE_COST = 14;
+ // ===========================================================================
+
+ // Offset generator used in bucket reduction to probabilistically avoid incomplete-addition
+ // edge cases in the accumulator. Derived from domain-separated precomputed generators.
+ static const AffineElement& get_offset_generator() noexcept
+ {
+ static const AffineElement offset_generator = []() {
+ if constexpr (std::same_as) {
+ return get_precomputed_generators()[0];
+ } else {
+ return get_precomputed_generators()[0];
+ }
+ }();
+ return offset_generator;
+ }
+
/**
* @brief MSMWorkUnit describes an MSM that may be part of a larger MSM
* @details For a multi-MSM where each MSM has a variable size, we want to split the MSMs up
* such that every available thread has an equal amount of MSM work to perform.
- * The actual MSM algorithm used is single-threaded. This is beneficial because we get better scaling.
- *
+ * Each work unit is computed single-threaded; a single MSM may be split across
+ * threads and reduced. This approach yields better scaling than thread-parallel
+ * bucket accumulation.
*/
struct MSMWorkUnit {
size_t batch_msm_index = 0;
@@ -42,15 +95,43 @@ template class MSM {
};
using ThreadWorkUnits = std::vector;
+ /**
+ * @brief Container for MSM input data passed between algorithm stages
+ * @note scalars must be in NON-Montgomery form for correct bucket index computation
+ */
struct MSMData {
- std::span scalars;
- std::span points;
- std::span scalar_indices;
- std::span point_schedule;
+ std::span scalars; // Scalars (non-Montgomery form)
+ std::span points; // Input points
+ std::span scalar_indices; // Indices of nonzero scalars
+ std::span point_schedule; // Scratch space for point scheduling
+
+ /**
+ * @brief Factory method to construct MSMData from a work unit
+ * @details Extracts the appropriate slices from the full arrays based on MSMWorkUnit parameters
+ */
+ static MSMData from_work_unit(std::span> all_scalars,
+ std::span> all_points,
+ const std::vector>& all_indices,
+ std::span point_schedule_buffer,
+ const MSMWorkUnit& work_unit) noexcept
+ {
+ return MSMData{
+ .scalars = all_scalars[work_unit.batch_msm_index],
+ .points = all_points[work_unit.batch_msm_index],
+ .scalar_indices =
+ std::span{ &all_indices[work_unit.batch_msm_index][work_unit.start_index],
+ work_unit.size },
+ .point_schedule = point_schedule_buffer,
+ };
+ }
};
/**
- * @brief Temp data structure, one created per thread!
+ * @brief Affine bucket accumulators for the fast affine-trick Pippenger variant
+ * @details Used when handle_edge_cases=false. Stores buckets in affine coordinates,
+ * enabling use of Montgomery's batch inversion trick. Does NOT handle
+ * edge cases like point doubling or point at infinity.
+ * @note Allocated per-call for WASM compatibility.
*/
struct BucketAccumulators {
std::vector buckets;
@@ -62,6 +143,13 @@ template class MSM {
{}
};
+ /**
+ * @brief Jacobian bucket accumulators for the safe Pippenger variant
+ * @details Used when handle_edge_cases=true or when affine trick is not beneficial.
+ * Stores buckets in Jacobian coordinates which correctly handle point
+ * doubling and point at infinity edge cases.
+ * @note Allocated per-call (not thread_local) in the Jacobian Pippenger path.
+ */
struct JacobianBucketAccumulators {
std::vector buckets;
BitVector bucket_exists;
@@ -72,81 +160,99 @@ template class MSM {
{}
};
/**
- * @brief Temp data structure, one created per thread!
+ * @brief Scratch space for batched affine point additions (one per thread)
*/
struct AffineAdditionData {
static constexpr size_t BATCH_SIZE = 2048;
// when adding affine points, we have an edge case where the number of points in the batch can overflow by 2
static constexpr size_t BATCH_OVERFLOW_SIZE = 2;
std::vector points_to_add;
- std::vector scalar_scratch_space;
- std::vector addition_result_bucket_destinations;
+ std::vector inversion_scratch_space; // Used for Montgomery batch inversion denominators
+ std::vector addition_result_bucket_destinations;
+ AffineElement null_location{}; // Dummy write target for branchless conditional moves
AffineAdditionData() noexcept
: points_to_add(BATCH_SIZE + BATCH_OVERFLOW_SIZE)
- , scalar_scratch_space(BATCH_SIZE + BATCH_OVERFLOW_SIZE)
+ , inversion_scratch_space(BATCH_SIZE + BATCH_OVERFLOW_SIZE)
, addition_result_bucket_destinations(((BATCH_SIZE + BATCH_OVERFLOW_SIZE) / 2))
{}
};
- static size_t get_num_rounds(size_t num_points) noexcept
+
+ /**
+ * @brief Packed point schedule entry: (point_index << 32) | bucket_index
+ * @details Used to sort points by their target bucket for cache-efficient processing
+ */
+ struct PointScheduleEntry {
+ uint64_t data;
+
+ [[nodiscard]] static constexpr PointScheduleEntry create(uint32_t point_index, uint32_t bucket_index) noexcept
+ {
+ return { (static_cast(point_index) << 32) | bucket_index };
+ }
+ [[nodiscard]] constexpr uint32_t point_index() const noexcept { return static_cast(data >> 32); }
+ [[nodiscard]] constexpr uint32_t bucket_index() const noexcept { return static_cast(data); }
+ };
+
+ // ======================= Public Methods =======================
+ // See README.md for algorithm details and mathematical derivations.
+
+ /**
+ * @brief Main entry point for single MSM computation
+ * @param handle_edge_cases false (default): fast affine variant; true: safe Jacobian variant
+ * @note Scalars are temporarily modified but restored before returning
+ */
+ static AffineElement msm(std::span points,
+ PolynomialSpan scalars,
+ bool handle_edge_cases = false) noexcept;
+
+ /**
+ * @brief Compute multiple MSMs in parallel with work balancing
+ * @note Scalars are temporarily modified but restored before returning
+ * @see README.md "Parallelization"
+ */
+ static std::vector batch_multi_scalar_mul(std::span> points,
+ std::span> scalars,
+ bool handle_edge_cases = true) noexcept;
+
+ // ======================= Test-Visible Methods =======================
+ // Exposed for unit testing; not part of the public API.
+
+ static uint32_t get_num_rounds(size_t num_points) noexcept
{
- const size_t bits_per_slice = get_optimal_log_num_buckets(num_points);
- const size_t num_rounds = (NUM_BITS_IN_FIELD + (bits_per_slice - 1)) / bits_per_slice;
- return num_rounds;
+ const uint32_t bits_per_slice = get_optimal_log_num_buckets(num_points);
+ return static_cast((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice);
}
+
+ /** @brief Batch add n/2 independent point pairs using Montgomery's trick */
static void add_affine_points(AffineElement* points,
const size_t num_points,
typename Curve::BaseField* scratch_space) noexcept;
- static void transform_scalar_and_get_nonzero_scalar_indices(std::span scalars,
- std::vector& consolidated_indices) noexcept;
- static std::vector get_work_units(std::span> scalars,
- std::vector>& msm_scalar_indices) noexcept;
- static uint32_t get_scalar_slice(const ScalarField& scalar, size_t round, size_t normal_slice_size) noexcept;
- static size_t get_optimal_log_num_buckets(const size_t num_points) noexcept;
- static bool use_affine_trick(const size_t num_points, const size_t num_buckets) noexcept;
-
- static Element small_pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept;
- static Element pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept;
- static Element evaluate_small_pippenger_round(MSMData& msm_data,
- const size_t round_index,
- JacobianBucketAccumulators& bucket_data,
- Element previous_round_output,
- const size_t bits_per_slice) noexcept;
-
- static Element evaluate_pippenger_round(MSMData& msm_data,
- const size_t round_index,
- AffineAdditionData& affine_data,
- BucketAccumulators& bucket_data,
- Element previous_round_output,
- const size_t bits_per_slice) noexcept;
-
- static void consume_point_schedule(std::span point_schedule,
- std::span points,
- AffineAdditionData& affine_data,
- BucketAccumulators& bucket_data,
- size_t num_input_points_processed,
- size_t num_queued_affine_points) noexcept;
+ /** @brief Extract c-bit slice from scalar for bucket index computation */
+ static uint32_t get_scalar_slice(const ScalarField& scalar, size_t round, size_t slice_size) noexcept;
- static std::vector batch_multi_scalar_mul(std::span> points,
- std::span> scalars,
- bool handle_edge_cases = true) noexcept;
- static AffineElement msm(std::span points,
- PolynomialSpan _scalars,
- bool handle_edge_cases = false) noexcept;
+ /** @brief Compute optimal bits per slice by minimizing cost over c in [1, MAX_SLICE_BITS) */
+ static uint32_t get_optimal_log_num_buckets(size_t num_points) noexcept;
+ /** @brief Process sorted point schedule into bucket accumulators using batched affine additions */
+ static void batch_accumulate_points_into_buckets(std::span point_schedule,
+ std::span points,
+ AffineAdditionData& affine_data,
+ BucketAccumulators& bucket_data) noexcept;
+
+ /** @brief Reduce buckets to single point using running (suffix) sum from high to low: R = sum(k * B_k) */
template static Element accumulate_buckets(BucketType& bucket_accumulators) noexcept
{
auto& buckets = bucket_accumulators.buckets;
BB_ASSERT_DEBUG(buckets.size() > static_cast(0));
int starting_index = static_cast(buckets.size() - 1);
- Element prefix_sum;
+ Element running_sum;
bool found_start = false;
while (!found_start && starting_index > 0) {
const size_t idx = static_cast(starting_index);
if (bucket_accumulators.bucket_exists.get(idx)) {
- prefix_sum = buckets[idx];
+ running_sum = buckets[idx];
found_start = true;
} else {
starting_index -= 1;
@@ -156,32 +262,104 @@ template class MSM {
return Curve::Group::point_at_infinity;
}
BB_ASSERT_DEBUG(starting_index > 0);
- AffineElement offset_generator = Curve::Group::affine_point_at_infinity;
- if constexpr (std::same_as) {
- constexpr auto gen = get_precomputed_generators()[0];
- offset_generator = gen;
- } else {
- constexpr auto gen = get_precomputed_generators()[0];
- offset_generator = gen;
- }
- Element sum = prefix_sum + offset_generator;
- for (int i = static_cast(starting_index - 1); i > 0; --i) {
+ const auto& offset_generator = get_offset_generator();
+ Element sum = running_sum + offset_generator;
+ for (int i = starting_index - 1; i > 0; --i) {
size_t idx = static_cast(i);
BB_ASSERT_DEBUG(idx < bucket_accumulators.bucket_exists.size());
if (bucket_accumulators.bucket_exists.get(idx)) {
-
- prefix_sum += buckets[idx];
+ running_sum += buckets[idx];
}
- sum += prefix_sum;
+ sum += running_sum;
}
return sum - offset_generator;
}
+
+ private:
+ // ======================= Private Implementation =======================
+
+ /** @brief Convert scalars from Montgomery form and collect indices of nonzero scalars */
+ static void transform_scalar_and_get_nonzero_scalar_indices(std::span scalars,
+ std::vector& nonzero_scalar_indices) noexcept;
+
+ /** @brief Distribute multiple MSMs across threads with balanced point counts */
+ static std::vector get_work_units(std::span> scalars,
+ std::vector>& msm_scalar_indices) noexcept;
+
+ /** @brief Decide if batch inversion saves work vs Jacobian additions */
+ static bool use_affine_trick(size_t num_points, size_t num_buckets) noexcept;
+
+ /** @brief Pippenger using Jacobian buckets (handles edge cases: doubling, infinity) */
+ static Element jacobian_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept;
+
+ /** @brief Pippenger using affine buckets with batch inversion (faster, no edge case handling) */
+ static Element affine_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept;
+
+ // Helpers for batch_accumulate_points_into_buckets. Inlined for performance.
+
+ // Process single point: if bucket has accumulator, pair them for addition; else cache in bucket.
+ __attribute__((always_inline)) static void process_single_point(size_t bucket,
+ const AffineElement* point_source,
+ AffineAdditionData& affine_data,
+ BucketAccumulators& bucket_data,
+ size_t& scratch_it,
+ size_t& point_it) noexcept
+ {
+ bool has_accumulator = bucket_data.bucket_exists.get(bucket);
+ if (has_accumulator) {
+ affine_data.points_to_add[scratch_it] = *point_source;
+ affine_data.points_to_add[scratch_it + 1] = bucket_data.buckets[bucket];
+ bucket_data.bucket_exists.set(bucket, false);
+ affine_data.addition_result_bucket_destinations[scratch_it >> 1] = static_cast(bucket);
+ scratch_it += 2;
+ } else {
+ bucket_data.buckets[bucket] = *point_source;
+ bucket_data.bucket_exists.set(bucket, true);
+ }
+ point_it += 1;
+ }
+
+ // Branchless bucket pair processing. Updates point_it (by 2 if same bucket, else 1) and scratch_it.
+ // See README.md "batch_accumulate_points_into_buckets Algorithm" for case analysis.
+ __attribute__((always_inline)) static void process_bucket_pair(size_t lhs_bucket,
+ size_t rhs_bucket,
+ const AffineElement* lhs_source,
+ const AffineElement* rhs_source_if_match,
+ AffineAdditionData& affine_data,
+ BucketAccumulators& bucket_data,
+ size_t& scratch_it,
+ size_t& point_it) noexcept
+ {
+ bool has_bucket_accumulator = bucket_data.bucket_exists.get(lhs_bucket);
+ bool buckets_match = lhs_bucket == rhs_bucket;
+ bool do_affine_add = buckets_match || has_bucket_accumulator;
+
+ const AffineElement* rhs_source = buckets_match ? rhs_source_if_match : &bucket_data.buckets[lhs_bucket];
+
+ AffineElement* lhs_destination =
+ do_affine_add ? &affine_data.points_to_add[scratch_it] : &bucket_data.buckets[lhs_bucket];
+ AffineElement* rhs_destination =
+ do_affine_add ? &affine_data.points_to_add[scratch_it + 1] : &affine_data.null_location;
+
+ uint32_t& dest_bucket = affine_data.addition_result_bucket_destinations[scratch_it >> 1];
+ dest_bucket = do_affine_add ? static_cast(lhs_bucket) : dest_bucket;
+
+ *lhs_destination = *lhs_source;
+ *rhs_destination = *rhs_source;
+
+ bucket_data.bucket_exists.set(lhs_bucket, (has_bucket_accumulator && buckets_match) || !do_affine_add);
+ scratch_it += do_affine_add ? 2 : 0;
+ point_it += (do_affine_add && buckets_match) ? 2 : 1;
+ }
};
+/** @brief Safe MSM wrapper (defaults to handle_edge_cases=true) */
template
typename Curve::Element pippenger(PolynomialSpan scalars,
std::span points,
bool handle_edge_cases = true) noexcept;
+
+/** @brief Fast MSM wrapper for linearly independent points (no edge case handling) */
template
typename Curve::Element pippenger_unsafe(PolynomialSpan scalars,
std::span points) noexcept;
@@ -189,5 +367,4 @@ typename Curve::Element pippenger_unsafe(PolynomialSpan;
extern template class MSM;
-// NEXT STEP ACCUMULATE BUVKETS
} // namespace bb::scalar_multiplication
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp
index f1b79f2d1f3c..b1d2ee42801a 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp
@@ -55,6 +55,7 @@ template class ScalarMultiplicationTest : public ::testing::Test {
}
return AffineElement(expected_acc);
}
+
static void SetUpTestSuite()
{
generators.resize(num_points);
@@ -69,502 +70,573 @@ template class ScalarMultiplicationTest : public ::testing::Test {
ASSERT_EQ(generators[i].x == generators[i + 1].x, false);
}
};
-};
-using CurveTypes = ::testing::Types;
-TYPED_TEST_SUITE(ScalarMultiplicationTest, CurveTypes);
+ // ======================= Test Methods =======================
-#define SCALAR_MULTIPLICATION_TYPE_ALIASES \
- using Curve = TypeParam; \
- using ScalarField = typename Curve::ScalarField; \
- // using AffineElement = typename Curve::AffineEleent;
+ void test_get_scalar_slice()
+ {
+ constexpr uint32_t fr_size = 254;
+ constexpr uint32_t slice_bits = 7;
+ constexpr uint32_t num_slices = (fr_size + 6) / 7;
+ constexpr uint32_t last_slice_bits = fr_size - ((num_slices - 1) * slice_bits);
+
+ for (size_t x = 0; x < 100; ++x) {
+ uint256_t input_u256 = engine.get_random_uint256();
+ input_u256.data[3] = input_u256.data[3] & 0x3FFFFFFFFFFFFFFF; // 254 bits
+ while (input_u256 > ScalarField::modulus) {
+ input_u256 -= ScalarField::modulus;
+ }
+ std::vector slices(num_slices);
+
+ uint256_t acc = input_u256;
+ for (uint32_t i = 0; i < num_slices; ++i) {
+ uint32_t mask = ((1U << slice_bits) - 1U);
+ uint32_t shift = slice_bits;
+ if (i == 0) {
+ mask = ((1U << last_slice_bits) - 1U);
+ shift = last_slice_bits;
+ }
+ slices[num_slices - 1 - i] = static_cast((acc & mask).data[0]);
+ acc = acc >> shift;
+ }
-TYPED_TEST(ScalarMultiplicationTest, GetScalarSlice)
-{
- SCALAR_MULTIPLICATION_TYPE_ALIASES
- const size_t fr_size = 254;
- const size_t slice_bits = 7;
- size_t num_slices = (fr_size + 6) / 7;
- size_t last_slice_bits = fr_size - ((num_slices - 1) * slice_bits);
-
- for (size_t x = 0; x < 100; ++x) {
-
- uint256_t input_u256 = engine.get_random_uint256();
- input_u256.data[3] = input_u256.data[3] & 0x3FFFFFFFFFFFFFFF; // 254 bits
- while (input_u256 > ScalarField::modulus) {
- input_u256 -= ScalarField::modulus;
- }
- std::vector slices(num_slices);
-
- uint256_t acc = input_u256;
- for (size_t i = 0; i < num_slices; ++i) {
- size_t mask = ((1 << slice_bits) - 1UL);
- size_t shift = slice_bits;
- if (i == 0) {
- mask = ((1UL << last_slice_bits) - 1UL);
- shift = last_slice_bits;
+ ScalarField input(input_u256);
+ input.self_from_montgomery_form();
+
+ ASSERT_EQ(input.data[0], input_u256.data[0]);
+ ASSERT_EQ(input.data[1], input_u256.data[1]);
+ ASSERT_EQ(input.data[2], input_u256.data[2]);
+ ASSERT_EQ(input.data[3], input_u256.data[3]);
+
+ for (uint32_t i = 0; i < num_slices; ++i) {
+ uint32_t result = scalar_multiplication::MSM::get_scalar_slice(input, i, slice_bits);
+ EXPECT_EQ(result, slices[i]);
}
- slices[num_slices - 1 - i] = static_cast((acc & mask).data[0]);
- acc = acc >> shift;
}
- // uint256_t input_u256 = 0;
-
- // for (size_t i = 0; i < num_slices; ++i) {
- // bool valid_slice = false;
- // while (!valid_slice) {
- // size_t mask = ((1 << slice_bits) - 1);
- // if (i == num_slices - 1) {
- // mask = ((1 << last_slice_bits) - 1);
- // }
- // const uint32_t slice = engine.get_random_uint32() & mask;
-
- // size_t shift = (fr_size - slice_bits - (i * slice_bits));
- // if (i == num_slices - 1) {
- // shift = 0;
- // }
-
- // const uint256_t new_input_u256 = input_u256 + (uint256_t(slice) << shift);
- // // BB_ASSERT(new_input_u256 < fr::modulus);
- // if (new_input_u256 < fr::modulus) {
- // input_u256 = new_input_u256;
- // slices[i] = slice;
- // valid_slice = true;
- // }
- // }
- // }
-
- // BB_ASSERT(input_u256 < fr::modulus);
- // while (input_u256 > fr::modulus) {
- // input_u256 -= fr::modulus;
- // }
- ScalarField input(input_u256);
- input.self_from_montgomery_form();
-
- ASSERT_EQ(input.data[0], input_u256.data[0]);
- ASSERT_EQ(input.data[1], input_u256.data[1]);
- ASSERT_EQ(input.data[2], input_u256.data[2]);
- ASSERT_EQ(input.data[3], input_u256.data[3]);
-
- for (size_t i = 0; i < num_slices; ++i) {
-
- uint32_t result = scalar_multiplication::MSM::get_scalar_slice(input, i, slice_bits);
- EXPECT_EQ(result, slices[i]);
+ }
+
+ void test_consume_point_batch()
+ {
+ const size_t total_points = 30071;
+ const size_t num_buckets = 128;
+
+ std::vector input_point_schedule;
+ for (size_t i = 0; i < total_points; ++i) {
+ uint64_t bucket = static_cast(engine.get_random_uint8()) & 0x7f;
+ uint64_t schedule = static_cast(bucket) + (static_cast(i) << 32);
+ input_point_schedule.push_back(schedule);
+ }
+ typename scalar_multiplication::MSM::AffineAdditionData affine_data;
+ typename scalar_multiplication::MSM::BucketAccumulators bucket_data(num_buckets);
+ scalar_multiplication::MSM::batch_accumulate_points_into_buckets(
+ input_point_schedule, generators, affine_data, bucket_data);
+
+ std::vector expected_buckets(num_buckets);
+ for (auto& e : expected_buckets) {
+ e.self_set_infinity();
+ }
+ for (size_t i = 0; i < total_points; ++i) {
+ uint64_t bucket = input_point_schedule[i] & 0xFFFFFFFF;
+ EXPECT_LT(static_cast(bucket), num_buckets);
+ expected_buckets[static_cast(bucket)] += generators[i];
+ }
+ for (size_t i = 0; i < num_buckets; ++i) {
+ if (!expected_buckets[i].is_point_at_infinity()) {
+ AffineElement expected(expected_buckets[i]);
+ EXPECT_EQ(expected, bucket_data.buckets[i]);
+ } else {
+ EXPECT_FALSE(bucket_data.bucket_exists.get(i));
+ }
}
}
- // fr test = 0;
- // test.data[0] = 0b;
- // test.data[1] = 0b010101
-}
-TYPED_TEST(ScalarMultiplicationTest, ConsumePointBatch)
-{
- using Curve = TypeParam;
- using AffineElement = typename Curve::AffineElement;
- // todo make this not a multiple of 10k
- const size_t total_points = 30071;
- const size_t num_buckets = 128;
+ void test_consume_point_batch_and_accumulate()
+ {
+ const size_t total_points = 30071;
+ const size_t num_buckets = 128;
+
+ std::vector input_point_schedule;
+ for (size_t i = 0; i < total_points; ++i) {
+ uint64_t bucket = static_cast(engine.get_random_uint8()) & 0x7f;
+ uint64_t schedule = static_cast(bucket) + (static_cast(i) << 32);
+ input_point_schedule.push_back(schedule);
+ }
+ typename scalar_multiplication::MSM::AffineAdditionData affine_data;
+ typename scalar_multiplication::MSM