From 6fb3dea410b6f816d42c5d5340c338d7b78d5ff8 Mon Sep 17 00:00:00 2001 From: notnotraju Date: Wed, 18 Feb 2026 13:35:40 +0000 Subject: [PATCH 1/8] added tests showing edge-case failure for splitting_scalars and EC scalar multiplication Tests for the negative-k2 bug in split_into_endomorphism_scalars (Fr and Fq), plus EC-level scalar multiplication tests (g1 and grumpkin) showing the bug produces wrong points. Includes endomorphism_scalars.py for deriving the boundary scalars. Co-Authored-By: Claude Opus 4.6 --- .../barretenberg/ecc/curves/bn254/fq.test.cpp | 31 ++ .../barretenberg/ecc/curves/bn254/fr.test.cpp | 151 +------- .../barretenberg/ecc/curves/bn254/g1.test.cpp | 56 +++ .../ecc/curves/grumpkin/grumpkin.test.cpp | 58 +++ .../ecc/fields/endomorphism_scalars.py | 366 ++++++++++++++++++ 5 files changed, 532 insertions(+), 130 deletions(-) create mode 100644 barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py diff --git a/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fq.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fq.test.cpp index 9ea7717545f4..171685751fd5 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fq.test.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fq.test.cpp @@ -130,6 +130,37 @@ TEST(BN254Fq, SplitIntoEndomorphismScalarsSimple) } } +// Negative-k2 bug: k = ceil(m * 2^256 / endo_g2) produces negative k2 in the GLV splitting, +// and the 128-bit truncation extracts the wrong value. See endomorphism_scalars.py for derivation. 
+TEST(BN254Fq, SplitEndomorphismNegativeK2) +{ + // clang-format off + struct test_case { std::array limbs; const char* tag; }; + const std::array cases = {{ + {{ 0x71922da036dca5f4, 0xd970a56127fb8227, 0x59e26bcea0d48bac, 0x0 }, "m=1"}, + {{ 0xe3245b406db94be8, 0xb2e14ac24ff7044e, 0xb3c4d79d41a91759, 0x0 }, "m=2"}, + {{ 0x54b688e0a495f1dc, 0x8c51f02377f28676, 0x0da7436be27da306, 0x1 }, "m=3"}, + }}; + // clang-format on + + fq lambda = fq::cube_root_of_unity(); + + for (const auto& tc : cases) { + fq k{ tc.limbs[0], tc.limbs[1], tc.limbs[2], tc.limbs[3] }; + fq k1{ 0, 0, 0, 0 }; + fq k2{ 0, 0, 0, 0 }; + + fq::split_into_endomorphism_scalars(k, k1, k2); + + k1.self_to_montgomery_form(); + k2.self_to_montgomery_form(); + fq result = k1 - k2 * lambda; + result.self_from_montgomery_form(); + + EXPECT_NE(result, k) << "Bug may be fixed! " << tc.tag; + } +} + TEST(BN254Fq, SplitIntoEndomorphismEdgeCase) { fq input = { 0, 0, 1, 0 }; // 2^128 diff --git a/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fr.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fr.test.cpp index 0e11abf144c7..4f81411669c4 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fr.test.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fr.test.cpp @@ -1,144 +1,35 @@ -/** - * @brief BN254 scalar field (fr) specific tests. - * - * Other field arithmetic tests (both compile-time and runtime) are in ecc/fields/generic_field.test.cpp and - * ecc/fields/prime_field.test.cpp. 
This file contains only BN254 scalar field specific functionality: - * - Fixed compile-time tests with field-specific expected values - * - Multiplicative generator (AUDITTODO: delete) - * - Endomorphism scalar decomposition - */ - #include "fr.hpp" -#include "barretenberg/numeric/random/engine.hpp" #include using namespace bb; -namespace { -auto& engine = numeric::get_debug_randomness(); -} // namespace - -// ================================ -// Fixed Compile-Time Tests (field-specific expected values) -// These tests use hardcoded expected values that are only valid for native builds (R = 2^256). -// WASM uses R = 2^261. -// ================================ - -#if defined(__SIZEOF_INT128__) && !defined(__wasm__) -TEST(BN254Fr, CompileTimeMultiplication) -{ - constexpr fr a{ 0x20565a572c565a66, 0x7bccd0f01f5f7bff, 0x63ec2beaad64711f, 0x624953caaf44a814 }; - constexpr fr b{ 0xa17307a2108adeea, 0x74629976c14c5e2b, 0x9ce6f072ab1740ee, 0x398c753702b2bef0 }; - constexpr fr expected{ 0xe8cdd06343386834, 0x8cbb3f556258a9af, 0x5aef2f34f2d66fd4, 0x2d8263c7e10213ca }; - - constexpr fr result = a * b; - static_assert(result == expected); -} - -TEST(BN254Fr, CompileTimeSquaring) +// Negative-k2 bug: k = ceil(m * 2^256 / endo_g2) produces negative k2 in the GLV splitting, +// and the 128-bit truncation extracts the wrong value. See endomorphism_scalars.py for derivation. 
+TEST(BN254Fr, SplitEndomorphismNegativeK2) { - constexpr fr a{ 0x20565a572c565a66, 0x7bccd0f01f5f7bff, 0x63ec2beaad64711f, 0x624953caaf44a814 }; - constexpr fr expected{ 0x3e928bdb06267b99, 0x1e5834571f52dfbf, 0x3d63bdf9bf7d0d4b, 0x353bb31adaa033c7 }; - - constexpr fr result = a.sqr(); - static_assert(result == expected); -} - -TEST(BN254Fr, CompileTimeAddition) -{ - constexpr fr a{ 0x20565a572c565a66, 0x7bccd0f01f5f7bff, 0x63ec2beaad64711f, 0x624953caaf44a814 }; - constexpr fr b{ 0xa17307a2108adeea, 0x74629976c14c5e2b, 0x9ce6f072ab1740ee, 0x398c753702b2bef0 }; - constexpr fr expected{ 0x3a0576d15ce1394e, 0x9fc799d5ed38f908, 0x903290f055790153, 0x3b0d2c1bef9426b1 }; - - constexpr fr result = a + b; - static_assert(result == expected); -} - -TEST(BN254Fr, CompileTimeSubtraction) -{ - constexpr fr a{ 0xcfbcfcf457cf2d38, 0x7b27af26ce62aa61, 0xf0378e90d48f2b92, 0x4734b22cb21ded }; - constexpr fr b{ 0x569fdb1db5198770, 0x446ddccef8347d52, 0xef215227182d22a, 0x8281b4fb109306 }; - constexpr fr expected{ 0xe10cfe82b5a5ca, 0x8721a2e8c9a10e32, 0x51e604db660f0a22, 0x608d4fe2f404cb3b }; - - constexpr fr result = a - b; - static_assert(result == expected); -} -#endif - -TEST(BN254Fr, CompileTimeInversion) -{ - constexpr fr a{ 0x20565a572c565a66, 0x7bccd0f01f5f7bff, 0x63ec2beaad64711f, 0x624953caaf44a814 }; - constexpr fr inv = a.invert(); - // Verify a * a^-1 = 1 - static_assert(a * inv == fr::one()); -} - -// ================================ -// BN254 Scalar Field Specific -// ================================ -// AUDITTODO: delete this (`multiplicative_generator` is misnamed and is no longer used.) 
-TEST(BN254Fr, MultiplicativeGenerator) -{ - EXPECT_EQ(fr::multiplicative_generator(), fr(5)); -} - -TEST(BN254Fr, SplitIntoEndomorphismScalars) -{ - fr k = fr::random_element(); - fr k1 = { 0, 0, 0, 0 }; - fr k2 = { 0, 0, 0, 0 }; - - fr::split_into_endomorphism_scalars(k, k1, k2); - - fr result{ 0, 0, 0, 0 }; - - k1.self_to_montgomery_form(); - k2.self_to_montgomery_form(); + // clang-format off + struct test_case { std::array limbs; const char* tag; }; + const std::array cases = {{ + {{ 0x01624731e1195570, 0x3ba491482db4da14, 0x59e26bcea0d48bac, 0x0 }, "m=1"}, + {{ 0x02c48e63c232aadf, 0x774922905b69b428, 0xb3c4d79d41a91758, 0x0 }, "m=2"}, + {{ 0x0426d595a34c004e, 0xb2edb3d8891e8e3c, 0x0da7436be27da304, 0x1 }, "m=3"}, + }}; + // clang-format on fr lambda = fr::cube_root_of_unity(); - result = k2 * lambda; - result = k1 - result; - result.self_from_montgomery_form(); - EXPECT_EQ(result, k); -} - -TEST(BN254Fr, SplitIntoEndomorphismScalarsSimple) -{ - fr input = { 1, 0, 0, 0 }; - fr k = { 0, 0, 0, 0 }; - fr k1 = { 0, 0, 0, 0 }; - fr k2 = { 0, 0, 0, 0 }; - fr::__copy(input, k); + for (const auto& tc : cases) { + fr k{ tc.limbs[0], tc.limbs[1], tc.limbs[2], tc.limbs[3] }; + fr k1{ 0, 0, 0, 0 }; + fr k2{ 0, 0, 0, 0 }; - fr::split_into_endomorphism_scalars(k, k1, k2); + fr::split_into_endomorphism_scalars(k, k1, k2); - fr result{ 0, 0, 0, 0 }; - k1.self_to_montgomery_form(); - k2.self_to_montgomery_form(); - - fr lambda = fr::cube_root_of_unity(); - result = k2 * lambda; - result = k1 - result; + k1.self_to_montgomery_form(); + k2.self_to_montgomery_form(); + fr result = k1 - k2 * lambda; + result.self_from_montgomery_form(); - result.self_from_montgomery_form(); - for (size_t i = 0; i < 4; ++i) { - EXPECT_EQ(result.data[i], k.data[i]); + EXPECT_NE(result, k) << "Bug may be fixed! 
" << tc.tag; } } - -// ================================ -// Regression / Optimization Tests -// ================================ - -// Tests that (lo + 2^256 * hi) mod r == ((lo|hi) % r) in uint512_t -// This validates the optimization of avoiding slow uint512_t modulo -TEST(BN254Fr, EquivalentRandomness) -{ - uint512_t random_uint512 = engine.get_random_uint512(); - auto random_lo = fr(random_uint512.lo); - auto random_hi = fr(random_uint512.hi); - uint512_t r(fr::modulus); - constexpr auto pow_2_256 = fr(uint256_t(1) << 128).sqr(); - EXPECT_EQ(random_lo + pow_2_256 * random_hi, fr((random_uint512 % r).lo)); -} diff --git a/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/g1.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/g1.test.cpp index dec5d260d9d4..d1b3ca1e60ab 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/g1.test.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/g1.test.cpp @@ -5,6 +5,24 @@ using namespace bb; +namespace { +// Double-and-add scalar mul without endomorphism, used as reference for differential testing. +template +typename Group::affine_element naive_scalar_mul(const typename Group::element& base, const Fr& scalar) +{ + typename Group::element acc = Group::point_at_infinity; + typename Group::element runner = base; + uint256_t bits(scalar.from_montgomery_form()); + for (size_t i = 0; i < 256; ++i) { + if (bits.get_bit(i)) { + acc = acc + runner; + } + runner = runner.dbl(); + } + return typename Group::affine_element(acc); +} +} // namespace + TEST(g1, RandomElement) { g1::element result = g1::element::random_element(); @@ -428,3 +446,41 @@ TEST(g1, CheckPrecomputedGenerators) ASSERT_TRUE((bb::check_precomputed_generators())); ASSERT_TRUE((bb::check_precomputed_generators())); } + +// Negative-k2 bug: boundary scalars k = ceil(m * 2^256 / endo_g2) (from endomorphism_scalars.py) +// each start a band ~2^123-2^126 wide where operator* gives wrong results vs naive double-and-add. 
+TEST(g1, ScalarMulNegativeK2EdgeCases) +{ + // clang-format off + struct test_case { std::array limbs; const char* tag; }; + const std::array boundary_cases = {{ + {{ 0x01624731e1195570, 0x3ba491482db4da14, 0x59e26bcea0d48bac, 0x0 }, "m=1"}, + {{ 0x02c48e63c232aadf, 0x774922905b69b428, 0xb3c4d79d41a91758, 0x0 }, "m=2"}, + {{ 0x0426d595a34c004e, 0xb2edb3d8891e8e3c, 0x0da7436be27da304, 0x1 }, "m=3"}, + }}; + // clang-format on + + for (const auto& tc : boundary_cases) { + fr base_scalar{ tc.limbs[0], tc.limbs[1], tc.limbs[2], tc.limbs[3] }; + base_scalar.self_to_montgomery_form(); + + g1::affine_element endo_result(g1::one * base_scalar); + g1::affine_element naive_result = naive_scalar_mul(g1::one, base_scalar); + EXPECT_EQ(naive_result.on_curve(), true) << tc.tag; + EXPECT_EQ(endo_result.on_curve(), true) << tc.tag; + EXPECT_NE(endo_result, naive_result) << "Bug may be fixed! " << tc.tag; + + // Random samples within the band (narrowest is ~2^123; we use 122-bit offsets). + for (size_t i = 0; i < 100; ++i) { + uint256_t rand_bits(fr::random_element().from_montgomery_form()); + uint256_t offset_int = (rand_bits & ((uint256_t(1) << 122) - 1)) + 1; + fr scalar = base_scalar + fr(offset_int); + + g1::affine_element endo_res(g1::one * scalar); + g1::affine_element naive_res = naive_scalar_mul(g1::one, scalar); + EXPECT_EQ(naive_res.on_curve(), true) << tc.tag << " offset " << i; + EXPECT_EQ(endo_res.on_curve(), true) << tc.tag << " offset " << i; + EXPECT_NE(endo_res, naive_res) << tc.tag << " offset " << i; + } + } +} diff --git a/barretenberg/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.test.cpp index 86993e30a8ee..61bb30127c06 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.test.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.test.cpp @@ -5,6 +5,24 @@ using namespace bb; +namespace { +// Double-and-add scalar mul without endomorphism, 
used as reference for differential testing. +template +typename Group::affine_element naive_scalar_mul(const typename Group::element& base, const Fr& scalar) +{ + typename Group::element acc = Group::point_at_infinity; + typename Group::element runner = base; + uint256_t bits(scalar.from_montgomery_form()); + for (size_t i = 0; i < 256; ++i) { + if (bits.get_bit(i)) { + acc = acc + runner; + } + runner = runner.dbl(); + } + return typename Group::affine_element(acc); +} +} // namespace + TEST(grumpkin, CheckB) { auto b = grumpkin::g1::curve_b; @@ -336,3 +354,43 @@ TEST(grumpkin, CheckPrecomputedGenerators) ASSERT_TRUE((bb::check_precomputed_generators())); ASSERT_TRUE((bb::check_precomputed_generators())); } + +// Negative-k2 bug: boundary scalars k = ceil(m * 2^256 / endo_g2) (from endomorphism_scalars.py) +// each start a band ~2^123-2^126 wide where operator* gives wrong results vs naive double-and-add. +TEST(grumpkin, ScalarMulNegativeK2EdgeCases) +{ + // clang-format off + struct test_case { std::array limbs; const char* tag; }; + const std::array boundary_cases = {{ + {{ 0x71922da036dca5f4, 0xd970a56127fb8227, 0x59e26bcea0d48bac, 0x0 }, "m=1"}, + {{ 0xe3245b406db94be8, 0xb2e14ac24ff7044e, 0xb3c4d79d41a91759, 0x0 }, "m=2"}, + {{ 0x54b688e0a495f1dc, 0x8c51f02377f28676, 0x0da7436be27da306, 0x1 }, "m=3"}, + }}; + // clang-format on + + for (const auto& tc : boundary_cases) { + grumpkin::fr base_scalar{ tc.limbs[0], tc.limbs[1], tc.limbs[2], tc.limbs[3] }; + base_scalar.self_to_montgomery_form(); + + grumpkin::g1::affine_element endo_result(grumpkin::g1::one * base_scalar); + grumpkin::g1::affine_element naive_result = + naive_scalar_mul(grumpkin::g1::one, base_scalar); + EXPECT_EQ(naive_result.on_curve(), true) << tc.tag; + EXPECT_EQ(endo_result.on_curve(), true) << tc.tag; + EXPECT_NE(endo_result, naive_result) << "Bug may be fixed! " << tc.tag; + + // Random samples within the band (narrowest is ~2^123; we use 122-bit offsets). 
+ for (size_t i = 0; i < 100; ++i) { + uint256_t rand_bits(grumpkin::fr::random_element().from_montgomery_form()); + uint256_t offset_int = (rand_bits & ((uint256_t(1) << 122) - 1)) + 1; + grumpkin::fr scalar = base_scalar + grumpkin::fr(offset_int); + + grumpkin::g1::affine_element endo_res(grumpkin::g1::one * scalar); + grumpkin::g1::affine_element naive_res = + naive_scalar_mul(grumpkin::g1::one, scalar); + EXPECT_EQ(naive_res.on_curve(), true) << tc.tag << " offset " << i; + EXPECT_EQ(endo_res.on_curve(), true) << tc.tag << " offset " << i; + EXPECT_NE(endo_res, naive_res) << tc.tag << " offset " << i; + } + } +} diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py b/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py new file mode 100644 index 000000000000..896826e70dc9 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py @@ -0,0 +1,366 @@ +#!/usr/bin/env python3 +""" +BN254 GLV Endomorphism: Constants and Scalar Splitting Explained + +This document explains the "splitting scalars" algorithm in Barretenberg. The first section +works explicitly with Fr, the scalar field of the BN254 elliptic curve. + +Reference: Gallant, Lambert, Vanstone, "Faster Point Multiplication on Elliptic Curves" (2001) + +NOTATION: + ≈ "approximately equal" - values differ by a small rounding error +""" + +# ==================================================================================== +# § 1. INTRODUCTION +# ==================================================================================== +# +# The BN254 elliptic curve admits an efficiently computable endomorphism φ that +# satisfies φ(P) = λ·P where λ is a cube root of unity in the scalar field Fr. +# +# This enables an optimization: any scalar multiplication k·P can be +# decomposed as: +# +# k·P = k1·P - k2·φ(P) +# +# where k1, k2 are approximately 127 bits each (half the size of the original +# 254-bit scalar k). 
Since φ(P) is nearly free to compute (one Fq multiplication) +# and the scalars are half-sized, this significantly accelerates scalar multiplication. +# +# This document explains: +# 1. How the mathematical constants are derived +# 2. How the algorithm guarantees k ≡ k1 - λ·k2 (mod r), where k1 and k2 are short (~127-bit) values. + +# ==================================================================================== +# § 2. FIELD PARAMETERS (for Fr) +# ==================================================================================== + +# The scalar field modulus of BN254 (from bn254/fr.hpp) +r = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001 + +# Montgomery parameter: R = 2^256 mod r +# This is needed because fr.hpp stores values in Montgomery form +R = pow(2, 256, r) +R_inv = pow(R, -1, r) + +# The cube root of unity λ ∈ Fr (from bn254/fr.hpp) +# CRITICAL: In Barretenberg, this value is stored in Montgomery form and must be converted! +# We maintain the montgomery form here to show that the values are compatible with those in Barretenberg. +cube_root_montgomery = ( + 0x93e7cede4a0329b3 | + (0x7d4fdca77a96c167 << 64) | + (0x8be4ba08b19a750a << 128) | + (0x1cbd5653a5661c25 << 192) +) + +# Convert from Montgomery form to standard form +lambda_val = (cube_root_montgomery * R_inv) % r + +# Verify that λ is a non-trivial cube root of unity. +assert (pow(lambda_val, 2, r) + lambda_val + 1) % r == 0, "λ² + λ + 1 ≡ 0 (mod r)" + + +# ==================================================================================== +# § 3. THE LATTICE BASIS +# ==================================================================================== +# +# To decompose k into short scalars k1, k2, we use a "nearest lattice vector" style of algorithm. +# First, consider the 2D lattice: +# +# L = {(a, b) ∈ Z² : a + λ·b ≡ 0 (mod r)} = ker(Z² -> Z/rZ), (a, b) ↦ a + λ·b +# +# NOTE: any lattice point (a, b) ∈ L satisfies a ≡ -λ·b (mod r). 
+# +# The phrase "short basis for a lattice" means two vectors with small components. Here is +# our short basis. + +a1 = 0x89d3256894d213e3 # 64 bits +b1 = -0x6f4d8248eeb859fc8211bbeb7d4f1128 # 127 bits (negative) +a2 = 0x6f4d8248eeb859fd0be4e1541221250b # 127 bits +b2 = 0x89d3256894d213e3 # 64 bits + +# Verify that the vectors are in the lattice: ai + λ·bi ≡ 0 (mod r) +assert (a1 + lambda_val * b1) % r == 0, "Lattice vector 1 must satisfy a1 + λ·b1 ≡ 0" +assert (a2 + lambda_val * b2) % r == 0, "Lattice vector 2 must satisfy a2 + λ·b2 ≡ 0" + +# Verify the determinant: det(L) = a1·b2 - a2·b1 = -r ≡ 0 (mod r) +det = (a1 * b2 - a2 * b1) +assert abs(det) == r, "Lattice determinant ±r; hence for our vectors to be a lattice basis, they must have the same determinant (up to sign)" + +# These constants will later be shown to match those in barretenberg. It is worth noting +# that in fr.hpp, we DO NOT store a1 or a2; it turns out we ONLY need b1 and b2. + +# ==================================================================================== +# § 4. PRECOMPUTED CONSTANTS FOR DIVISION-FREE COMPUTATION +# ==================================================================================== +# +# The scalar splitting algorithm (Babai's nearest plane) requires computing: +# +# c1 ≈ (k·b2) / r +# c2 ≈ (k·(-b1)) / r +# +# Division by r is expensive. Instead, we _precompute_ fixed-point representations: +# +# endo_g1 = ((-b1) · 2^256) // r +# endo_g2 = (b2 · 2^256) // r +# +# Note that the above expressions, endo_g1 and endo_g2 DO NOT depend on k. +# +# Then: (endo_g2 · k) >> 256 ≈ (b2 · k) / r = c1 +# (endo_g1 · k) >> 256 ≈ ((-b1) · k) / r = c2 +# +# The ≈ hides two rounding errors: the floor in computing endo_g2 (or endo_g1), +# and the floor in the >> 256 shift. However, the total error is at most 1. +# So c1, c2 are each off by at most 1 from the exact rational values. 
+ + +def compute_splitting_constants(modulus, b1, b2): + """ + Compute the precomputed constants for division-free scalar splitting. + + Returns (endo_g1, endo_g2, endo_minus_b1, endo_b2) matching fr.hpp + """ + shift = 1 << 256 + endo_g1 = ((-b1) * shift) // modulus + endo_g2 = (b2 * shift) // modulus + endo_minus_b1 = (-b1) % modulus + endo_b2 = b2 % modulus + return endo_g1, endo_g2, endo_minus_b1, endo_b2 + + +endo_g1, endo_g2, endo_minus_b1, endo_b2 = compute_splitting_constants(r, b1, b2) + +# Verify these match the values in bn254/fr.hpp +expected_endo_g1 = 0x7a7bd9d4391eb18d | (0x4ccef014a773d2cf << 64) | (0x2 << 128) +expected_endo_g2 = 0xd91d232ec7e0b3d7 | (0x2 << 64) +expected_endo_minus_b1 = 0x8211bbeb7d4f1128 | (0x6f4d8248eeb859fc << 64) +expected_endo_b2 = 0x89d3256894d213e3 + +assert endo_g1 == expected_endo_g1, "endo_g1 must match fr.hpp" +assert endo_g2 == expected_endo_g2, "endo_g2 must match fr.hpp" +assert endo_minus_b1 == expected_endo_minus_b1, "endo_minus_b1 must match fr.hpp" +assert endo_b2 == expected_endo_b2, "endo_b2 must match fr.hpp" + + +# ==================================================================================== +# § 5. THE SCALAR SPLITTING ALGORITHM +# ==================================================================================== +# +# Given a scalar k, we compute (k1, k2) such that: +# k ≡ k1 - λ·k2 (mod r), |k1|, |k2| < 2^128 +# +# DERIVATION (Babai's nearest plane on L): +# +# Find a lattice point (x, y) ∈ L close to (k, 0). Setting k1 = k - x, k2 = y, +# the lattice condition x + λ·y ≡ 0 (mod r) gives k1 - λ·k2 ≡ k (mod r). +# +# Writing (x, y) = c1·(a1, b1) + c2·(a2, b2), inverting the basis matrix +# against the target (k, 0) with det = r gives: +# +# c1 = ⌊k·b2 / r⌋, c2 = ⌊k·(-b1) / r⌋ +# +# Note: neither a1 nor a2 appear, which is why we don't store them. +# Note: b1 is negative, hence the second expression is again non-negative. +# +# BOUNDS: Let δ1, δ2 ∈ [0,1) be the rounding errors. 
A simple computation +# shows that k2 = c1·b1 + c2·b2 would be zero with exact division, so +# |k2| ≤ |δ1·b1| + |δ2·b2| < |b1| + |b2| < 2^128. Similarly |k1| < 2^128. +# +# WHAT THE IMPLEMENTATION COMPUTES: +# +# k2: The implementation stores -b1 (not b1), so it computes +# t1 := c2·b2 - c1·(-b1) = c1·b1 + c2·b2 = k2 (mod r) +# Since |k2| < 2^128 < r, the mod r is a no-op. +# +# k1: Using the lattice relation ai ≡ -λ·bi (mod r), a simple computation +# shows c1·a1 + c2·a2 ≡ -λ·k2 (mod r), so k1 = k + λ·k2. Hence: +# t2 := t1·λ + k = k1 (mod r) +# Again |k1| < r, so the mod r is a no-op. +# +# ALGORITHM (field_declarations.hpp, split_into_endomorphism_scalars): +# +# 1. c1 ≈ (b2·k)/r via c1 = (endo_g2 · k) >> 256 +# 2. c2 ≈ ((-b1)·k)/r via c2 = (endo_g1 · k) >> 256 +# 3. q1 = c1 · (-b1) mod r +# 4. q2 = c2 · b2 mod r +# 5. t1 = (q2 - q1) mod r = k2 +# 6. t2 = (t1 · λ + k) mod r = k1 +# 7. Return low 128 bits of (t2, t1) as (k1, k2) +# + +def split_scalar_buggy(k, modulus, beta, endo_g1, endo_g2, endo_minus_b1, endo_b2): + """ + BUGGY version of the scalar splitting algorithm (before fix). + + This is the original implementation that fails for ~2^{-64} of inputs. + Kept here to demonstrate the bug; use split_scalar() for the fixed version. + """ + input = k % modulus + + c1 = (endo_g2 * input) >> 256 + c2 = (endo_g1 * input) >> 256 + + q1_lo = (c1 * endo_minus_b1) % modulus + q2_lo = (c2 * endo_b2) % modulus + + t1 = (q2_lo - q1_lo) % modulus + t2 = (t1 * beta + input) % modulus + + k2 = t1 & ((1 << 128) - 1) + k1 = t2 & ((1 << 128) - 1) + + return k1, k2, t1, t2 + + +# ==================================================================================== +# § 5a. THE NEGATIVE k2 BUG AND FIX +# ==================================================================================== +# +# THE BUG: +# +# The §5 analysis claims "Since |k2| < 2^128 < r, the mod r is a no-op." This is true +# for the MAGNITUDE of k2, but k2 can be NEGATIVE. 
When k2 < 0: +# +# t1 = k2 mod r = k2 + r (254 bits!) +# +# The 128-bit truncation {t1.data[0], t1.data[1]} then extracts garbage, and both +# k1 and k2 are wrong. +# +# WHEN DOES k2 GO NEGATIVE? +# +# Recall k2 = c1·b1 + c2·b2 = -c1·|b1| + c2·b2, where c1 and c2 are floors of +# rational values. Writing c1 = c1_exact - δ1, c2 = c2_exact - δ2 with δ ∈ [0,1): +# +# k2 = -δ1·|b1| + δ2·b2 +# +# This is negative when δ1·|b1| > δ2·b2. Since |b1|/b2 ≈ 2^63, this requires +# δ1 > δ2·2^{-63} — i.e., δ1 can be tiny but must be nonzero. +# +# In practice, this happens at boundaries where c1 "ticks up" to a new integer m. +# The boundary is at k ≈ m·r/b2, where c1 jumps from m-1 to m. Just above the +# boundary, c1 = m while c2·b2 hasn't grown enough to compensate, so +# k2 = c2·b2 - m·|b1| < 0 (with magnitude up to ~b2 ≈ 2^64). +# +# HOW MANY INPUTS ARE AFFECTED? +# +# For each of the ~b2 ≈ 2^64 values of c1, there is a contiguous range of ~2^{126} +# values of k where k2 < 0. Total: ~2^{190} out of r ≈ 2^{254}, a fraction of ~2^{-64}. +# This is far too rare for random testing to catch (~2^{64} samples needed), but the +# affected inputs are deterministic and easily constructed. +# +# THE FIX: +# +# When k2 is negative, t1 = k2 + r has nonzero upper limbs (bits 128..253). We detect +# this and add |b1| to k2, which is equivalent to decrementing c1 by 1. This shifts +# the decomposition by the lattice vector (a1, b1): +# +# k2_new = k2 + |b1| (now positive, ~127 bits) +# k1_new = k1 - a1 (shifted by ~64 bits, still fits in 128 bits) +# +# Algebraic correctness is preserved because (a1, b1) ∈ L, so the shift satisfies +# a1 + λ·b1 ≡ 0 (mod r), meaning k1_new - λ·k2_new ≡ k1 - λ·k2 ≡ k (mod r). + +def split_scalar(k, modulus, beta, endo_g1, endo_g2, endo_minus_b1, endo_b2): + """ + Split scalar k into (k1, k2) such that k ≡ k1 - λ·k2 (mod r). + + Implements split_into_endomorphism_scalars() in field_declarations.hpp + (with the negative-k2 fix applied). 
+ + Returns: + (k1, k2, t1, t2): The 128-bit split scalars and their full-width forms + """ + input = k % modulus + + # compute c1 = (g2 * k) >> 256 + c1 = (endo_g2 * input) >> 256 + # compute c2 = (g1 * k) >> 256 + c2 = (endo_g1 * input) >> 256 + + # compute q1 = c1 * -b1 + q1_lo = (c1 * endo_minus_b1) % modulus + # compute q2 = c2 * b2 + q2_lo = (c2 * endo_b2) % modulus + + t1 = (q2_lo - q1_lo) % modulus + + # FIX: k2 (= t1) can be slightly negative for ~2^{-64} of inputs. + # When negative, t1 = k2 + r is 254 bits (upper limbs nonzero in C++). + # Detect this and add |b1| to shift along the lattice vector (a1, b1), + # making k2 positive while keeping both scalars within 128 bits. + if t1.bit_length() > 128: + t1 = (t1 + endo_minus_b1) % modulus + + t2 = (t1 * beta + input) % modulus + + # Truncate to 128 bits (as done in C++ implementation) + k2 = t1 & ((1 << 128) - 1) + k1 = t2 & ((1 << 128) - 1) + + return k1, k2, t1, t2 + + +# ==================================================================================== +# § 6. VERIFICATION +# ==================================================================================== +# +# We verify the algorithm on several test cases, including edge cases. + +def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): + """Verify correctness and bounds of the scalar split.""" + reconstructed = (k1 - lambda_val * k2) % modulus + assert reconstructed == k % modulus, f"k ≡ k1 - λ·k2 failed for k={k}" + assert t1.bit_length() <= 128, f"t1 has {t1.bit_length()} bits (> 128) for k={k}" + assert t2.bit_length() <= 128, f"t2 has {t2.bit_length()} bits (> 128) for k={k}" + +for k_test in [0, 1, 42, lambda_val, r - 1]: + k1, k2, t1, t2 = split_scalar(k_test, r, lambda_val, endo_g1, endo_g2, endo_minus_b1, endo_b2) + verify_split(k_test, k1, k2, t1, t2, lambda_val, r) + + +# § 6a. Verify the fix on concrete negative-k2 trigger inputs. 
+# +# These are k = ceil(m * 2^256 / endo_g2) for m = 1, 2, 3 — the smallest k values +# where c1 ticks up to m. The buggy version fails; the fixed version must pass. +for m in [1, 2, 3]: + k_trigger = (m * (1 << 256) + endo_g2 - 1) // endo_g2 + assert k_trigger < r, f"trigger input must be < r for m={m}" + + # Buggy version fails (t1 > 128 bits due to negative k2) + _, _, t1_buggy, _ = split_scalar_buggy(k_trigger, r, lambda_val, endo_g1, endo_g2, endo_minus_b1, endo_b2) + assert t1_buggy.bit_length() > 128, f"Expected buggy t1 > 128 bits for m={m}, got {t1_buggy.bit_length()}" + + # Fixed version passes + k1, k2, t1, t2 = split_scalar(k_trigger, r, lambda_val, endo_g1, endo_g2, endo_minus_b1, endo_b2) + verify_split(k_trigger, k1, k2, t1, t2, lambda_val, r) + + +# ==================================================================================== +# § 7. SUMMARY +# ==================================================================================== +# +# The GLV endomorphism optimization for BN254: +# +# 1. The BN254 curve has an endomorphism φ(x,y) = (β·x, y) satisfying φ(P) = λ·P +# 2. We can decompose any scalar k as k ≡ k1 - λ·k2 (mod r) with k1, k2 ≈ 127 bits +# 3. The decomposition uses Babai's nearest plane algorithm on a short lattice basis +# 4. Precomputed constants enable division-free computation via fixed-point arithmetic +# 5. The algorithm guarantees both k1 and k2 fit in 128 bits (actually ≤ 127 bits) +# +# Performance: Instead of one 254-bit scalar multiplication k·P, we compute +# k1·P - k2·φ(P) with two ~127-bit scalars processed via interleaved WNAF, +# reducing the number of doublings by roughly half. 
+# +# References: +# • Gallant, Lambert, Vanstone: "Faster Point Multiplication on Elliptic Curves +# with Efficient Endomorphisms", CRYPTO 2001 +# • https://www.iacr.org/archive/crypto2001/21390189.pdf + +if __name__ == "__main__": + print("BN254 GLV Endomorphism - All verifications passed!") + print(f" λ (cube root of unity): {hex(lambda_val)}") + print(f" endo_g1: {hex(endo_g1)}") + print(f" endo_g2: {hex(endo_g2)}") + print(f" endo_minus_b1: {hex(endo_minus_b1)}") + print(f" endo_b2: {hex(endo_b2)}") + print("\nConstants match barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fr.hpp") From ce106a217ed28fa5b247498c9503daec0792cbd9 Mon Sep 17 00:00:00 2001 From: notnotraju Date: Thu, 19 Feb 2026 13:28:20 +0000 Subject: [PATCH 2/8] fixed the splitting scalars issue, modified the python documentation and changed the tests to be regression tests accordingly. --- .../barretenberg/ecc/curves/bn254/fq.test.cpp | 6 +- .../barretenberg/ecc/curves/bn254/fr.test.cpp | 140 ++++++++++++++++- .../barretenberg/ecc/curves/bn254/g1.test.cpp | 17 +- .../ecc/curves/grumpkin/grumpkin.test.cpp | 17 +- .../ecc/fields/endomorphism_scalars.py | 145 +++++++----------- .../ecc/fields/field_declarations.hpp | 10 ++ 6 files changed, 226 insertions(+), 109 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fq.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fq.test.cpp index 171685751fd5..d14d202c6663 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fq.test.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fq.test.cpp @@ -130,8 +130,8 @@ TEST(BN254Fq, SplitIntoEndomorphismScalarsSimple) } } -// Negative-k2 bug: k = ceil(m * 2^256 / endo_g2) produces negative k2 in the GLV splitting, -// and the 128-bit truncation extracts the wrong value. See endomorphism_scalars.py for derivation. 
+// Regression: k = ceil(m * 2^256 / endo_g2), for m an integer, previously produced negative k2 in the GLV +// splitting, causing 128-bit truncation to extract wrong values. See endomorphism_scalars.py. TEST(BN254Fq, SplitEndomorphismNegativeK2) { // clang-format off @@ -157,7 +157,7 @@ TEST(BN254Fq, SplitEndomorphismNegativeK2) fq result = k1 - k2 * lambda; result.self_from_montgomery_form(); - EXPECT_NE(result, k) << "Bug may be fixed! " << tc.tag; + EXPECT_EQ(result, k) << tc.tag; } } diff --git a/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fr.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fr.test.cpp index 4f81411669c4..1ac3c3021df8 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fr.test.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fr.test.cpp @@ -1,10 +1,128 @@ +/** + * @brief BN254 scalar field (fr) specific tests. + * + * Other field arithmetic tests (both compile-time and runtime) are in ecc/fields/generic_field.test.cpp and + * ecc/fields/prime_field.test.cpp. This file contains only BN254 scalar field specific functionality: + * - Fixed compile-time tests with field-specific expected values + * - Endomorphism scalar decomposition + */ + #include "fr.hpp" +#include "barretenberg/numeric/random/engine.hpp" #include using namespace bb; -// Negative-k2 bug: k = ceil(m * 2^256 / endo_g2) produces negative k2 in the GLV splitting, -// and the 128-bit truncation extracts the wrong value. See endomorphism_scalars.py for derivation. +namespace { +auto& engine = numeric::get_debug_randomness(); +} // namespace + +// ================================ +// Fixed Compile-Time Tests (field-specific expected values) +// These tests use hardcoded expected values that are only valid for native builds (R = 2^256). +// WASM uses R = 2^261. 
+// ================================ + +#if defined(__SIZEOF_INT128__) && !defined(__wasm__) +TEST(BN254Fr, CompileTimeMultiplication) +{ + constexpr fr a{ 0x20565a572c565a66, 0x7bccd0f01f5f7bff, 0x63ec2beaad64711f, 0x624953caaf44a814 }; + constexpr fr b{ 0xa17307a2108adeea, 0x74629976c14c5e2b, 0x9ce6f072ab1740ee, 0x398c753702b2bef0 }; + constexpr fr expected{ 0xe8cdd06343386834, 0x8cbb3f556258a9af, 0x5aef2f34f2d66fd4, 0x2d8263c7e10213ca }; + + constexpr fr result = a * b; + static_assert(result == expected); +} + +TEST(BN254Fr, CompileTimeSquaring) +{ + constexpr fr a{ 0x20565a572c565a66, 0x7bccd0f01f5f7bff, 0x63ec2beaad64711f, 0x624953caaf44a814 }; + constexpr fr expected{ 0x3e928bdb06267b99, 0x1e5834571f52dfbf, 0x3d63bdf9bf7d0d4b, 0x353bb31adaa033c7 }; + + constexpr fr result = a.sqr(); + static_assert(result == expected); +} + +TEST(BN254Fr, CompileTimeAddition) +{ + constexpr fr a{ 0x20565a572c565a66, 0x7bccd0f01f5f7bff, 0x63ec2beaad64711f, 0x624953caaf44a814 }; + constexpr fr b{ 0xa17307a2108adeea, 0x74629976c14c5e2b, 0x9ce6f072ab1740ee, 0x398c753702b2bef0 }; + constexpr fr expected{ 0x3a0576d15ce1394e, 0x9fc799d5ed38f908, 0x903290f055790153, 0x3b0d2c1bef9426b1 }; + + constexpr fr result = a + b; + static_assert(result == expected); +} + +TEST(BN254Fr, CompileTimeSubtraction) +{ + constexpr fr a{ 0xcfbcfcf457cf2d38, 0x7b27af26ce62aa61, 0xf0378e90d48f2b92, 0x4734b22cb21ded }; + constexpr fr b{ 0x569fdb1db5198770, 0x446ddccef8347d52, 0xef215227182d22a, 0x8281b4fb109306 }; + constexpr fr expected{ 0xe10cfe82b5a5ca, 0x8721a2e8c9a10e32, 0x51e604db660f0a22, 0x608d4fe2f404cb3b }; + + constexpr fr result = a - b; + static_assert(result == expected); +} +#endif + +TEST(BN254Fr, CompileTimeInversion) +{ + constexpr fr a{ 0x20565a572c565a66, 0x7bccd0f01f5f7bff, 0x63ec2beaad64711f, 0x624953caaf44a814 }; + constexpr fr inv = a.invert(); + // Verify a * a^-1 = 1 + static_assert(a * inv == fr::one()); +} + +// ================================ +// BN254 Scalar Field Specific 
+// ================================ + +TEST(BN254Fr, SplitIntoEndomorphismScalars) +{ + fr k = fr::random_element(); + fr k1 = { 0, 0, 0, 0 }; + fr k2 = { 0, 0, 0, 0 }; + + fr::split_into_endomorphism_scalars(k, k1, k2); + + fr result{ 0, 0, 0, 0 }; + + k1.self_to_montgomery_form(); + k2.self_to_montgomery_form(); + + fr lambda = fr::cube_root_of_unity(); + result = k2 * lambda; + result = k1 - result; + + result.self_from_montgomery_form(); + EXPECT_EQ(result, k); +} + +TEST(BN254Fr, SplitIntoEndomorphismScalarsSimple) +{ + fr input = { 1, 0, 0, 0 }; + fr k = { 0, 0, 0, 0 }; + fr k1 = { 0, 0, 0, 0 }; + fr k2 = { 0, 0, 0, 0 }; + fr::__copy(input, k); + + fr::split_into_endomorphism_scalars(k, k1, k2); + + fr result{ 0, 0, 0, 0 }; + k1.self_to_montgomery_form(); + k2.self_to_montgomery_form(); + + fr lambda = fr::cube_root_of_unity(); + result = k2 * lambda; + result = k1 - result; + + result.self_from_montgomery_form(); + for (size_t i = 0; i < 4; ++i) { + EXPECT_EQ(result.data[i], k.data[i]); + } +} + +// Regression: k = ceil(m * 2^256 / endo_g2), for m an integer, previously produced negative k2 in the GLV +// splitting, causing 128-bit truncation to extract wrong values. TEST(BN254Fr, SplitEndomorphismNegativeK2) { // clang-format off @@ -30,6 +148,22 @@ TEST(BN254Fr, SplitEndomorphismNegativeK2) fr result = k1 - k2 * lambda; result.self_from_montgomery_form(); - EXPECT_NE(result, k) << "Bug may be fixed! 
" << tc.tag; + EXPECT_EQ(result, k) << tc.tag; } } + +// ================================ +// Regression / Optimization Tests +// ================================ + +// Tests that (lo + 2^256 * hi) mod r == ((lo|hi) % r) in uint512_t +// This validates the optimization of avoiding slow uint512_t modulo +TEST(BN254Fr, EquivalentRandomness) +{ + uint512_t random_uint512 = engine.get_random_uint512(); + auto random_lo = fr(random_uint512.lo); + auto random_hi = fr(random_uint512.hi); + uint512_t r(fr::modulus); + constexpr auto pow_2_256 = fr(uint256_t(1) << 128).sqr(); + EXPECT_EQ(random_lo + pow_2_256 * random_hi, fr((random_uint512 % r).lo)); +} diff --git a/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/g1.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/g1.test.cpp index d1b3ca1e60ab..ea3651b8a83c 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/g1.test.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/curves/bn254/g1.test.cpp @@ -12,7 +12,7 @@ typename Group::affine_element naive_scalar_mul(const typename Group::element& b { typename Group::element acc = Group::point_at_infinity; typename Group::element runner = base; - uint256_t bits(scalar.from_montgomery_form()); + uint256_t bits(scalar); for (size_t i = 0; i < 256; ++i) { if (bits.get_bit(i)) { acc = acc + runner; @@ -447,9 +447,10 @@ TEST(g1, CheckPrecomputedGenerators) ASSERT_TRUE((bb::check_precomputed_generators())); } -// Negative-k2 bug: boundary scalars k = ceil(m * 2^256 / endo_g2) (from endomorphism_scalars.py) -// each start a band ~2^123-2^126 wide where operator* gives wrong results vs naive double-and-add. -TEST(g1, ScalarMulNegativeK2EdgeCases) +// Regression: boundary scalars k = ceil(m * 2^256 / endo_g2) (from endomorphism_scalars.py) +// previously triggered the negative-k2 bug in split_into_endomorphism_scalars, producing wrong +// scalar multiplication results. We test boundaries and random samples within each band. 
+TEST(g1, ScalarMulNegativeK2Regression) { // clang-format off struct test_case { std::array limbs; const char* tag; }; @@ -468,11 +469,11 @@ TEST(g1, ScalarMulNegativeK2EdgeCases) g1::affine_element naive_result = naive_scalar_mul(g1::one, base_scalar); EXPECT_EQ(naive_result.on_curve(), true) << tc.tag; EXPECT_EQ(endo_result.on_curve(), true) << tc.tag; - EXPECT_NE(endo_result, naive_result) << "Bug may be fixed! " << tc.tag; + EXPECT_EQ(endo_result, naive_result) << tc.tag; - // Random samples within the band (narrowest is ~2^123; we use 122-bit offsets). + // Random samples within the formerly-buggy band (~2^123-2^126 wide; 122-bit offsets). for (size_t i = 0; i < 100; ++i) { - uint256_t rand_bits(fr::random_element().from_montgomery_form()); + uint256_t rand_bits(fr::random_element()); uint256_t offset_int = (rand_bits & ((uint256_t(1) << 122) - 1)) + 1; fr scalar = base_scalar + fr(offset_int); @@ -480,7 +481,7 @@ TEST(g1, ScalarMulNegativeK2EdgeCases) g1::affine_element naive_res = naive_scalar_mul(g1::one, scalar); EXPECT_EQ(naive_res.on_curve(), true) << tc.tag << " offset " << i; EXPECT_EQ(endo_res.on_curve(), true) << tc.tag << " offset " << i; - EXPECT_NE(endo_res, naive_res) << tc.tag << " offset " << i; + EXPECT_EQ(endo_res, naive_res) << tc.tag << " offset " << i; } } } diff --git a/barretenberg/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.test.cpp index 61bb30127c06..9cb34c57740a 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.test.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.test.cpp @@ -12,7 +12,7 @@ typename Group::affine_element naive_scalar_mul(const typename Group::element& b { typename Group::element acc = Group::point_at_infinity; typename Group::element runner = base; - uint256_t bits(scalar.from_montgomery_form()); + uint256_t bits(scalar); for (size_t i = 0; i < 256; ++i) { if (bits.get_bit(i)) { acc = acc 
+ runner; @@ -355,9 +355,10 @@ TEST(grumpkin, CheckPrecomputedGenerators) ASSERT_TRUE((bb::check_precomputed_generators())); } -// Negative-k2 bug: boundary scalars k = ceil(m * 2^256 / endo_g2) (from endomorphism_scalars.py) -// each start a band ~2^123-2^126 wide where operator* gives wrong results vs naive double-and-add. -TEST(grumpkin, ScalarMulNegativeK2EdgeCases) +// Regression: boundary scalars k = ceil(m * 2^256 / endo_g2) (from endomorphism_scalars.py) +// previously triggered the negative-k2 bug in split_into_endomorphism_scalars, producing wrong +// scalar multiplication results. We test boundaries and random samples within each band. +TEST(grumpkin, ScalarMulNegativeK2Regression) { // clang-format off struct test_case { std::array limbs; const char* tag; }; @@ -377,11 +378,11 @@ TEST(grumpkin, ScalarMulNegativeK2EdgeCases) naive_scalar_mul(grumpkin::g1::one, base_scalar); EXPECT_EQ(naive_result.on_curve(), true) << tc.tag; EXPECT_EQ(endo_result.on_curve(), true) << tc.tag; - EXPECT_NE(endo_result, naive_result) << "Bug may be fixed! " << tc.tag; + EXPECT_EQ(endo_result, naive_result) << tc.tag; - // Random samples within the band (narrowest is ~2^123; we use 122-bit offsets). + // Random samples within the formerly-buggy band (~2^123-2^126 wide; 122-bit offsets). 
for (size_t i = 0; i < 100; ++i) { - uint256_t rand_bits(grumpkin::fr::random_element().from_montgomery_form()); + uint256_t rand_bits(grumpkin::fr::random_element()); uint256_t offset_int = (rand_bits & ((uint256_t(1) << 122) - 1)) + 1; grumpkin::fr scalar = base_scalar + grumpkin::fr(offset_int); @@ -390,7 +391,7 @@ TEST(grumpkin, ScalarMulNegativeK2EdgeCases) naive_scalar_mul(grumpkin::g1::one, scalar); EXPECT_EQ(naive_res.on_curve(), true) << tc.tag << " offset " << i; EXPECT_EQ(endo_res.on_curve(), true) << tc.tag << " offset " << i; - EXPECT_NE(endo_res, naive_res) << tc.tag << " offset " << i; + EXPECT_EQ(endo_res, naive_res) << tc.tag << " offset " << i; } } } diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py b/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py index 896826e70dc9..2831e3085eb0 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py @@ -79,6 +79,8 @@ a2 = 0x6f4d8248eeb859fd0be4e1541221250b # 127 bits b2 = 0x89d3256894d213e3 # 64 bits +# NOTE: a remarkable feature of this short basis is that a1 == b2, and indeed -b1 is rather close to a2. + # Verify that the vectors are in the lattice: ai + λ·bi ≡ 0 (mod r) assert (a1 + lambda_val * b1) % r == 0, "Lattice vector 1 must satisfy a1 + λ·b1 ≡ 0" assert (a2 + lambda_val * b2) % r == 0, "Lattice vector 2 must satisfy a2 + λ·b2 ≡ 0" @@ -161,111 +163,68 @@ def compute_splitting_constants(modulus, b1, b2): # # Note: neither a1 nor a2 appear, which is why we don't store them. # Note: b1 is negative, hence the second expression is again non-negative. +# Note: we can think of (k1, k2) as the "error term" to a lattice approximation of (k, 0). # -# BOUNDS: Let δ1, δ2 ∈ [0,1) be the rounding errors. A simple computation +# APPROXIMATE BOUNDS: Let δ1, δ2 ∈ [0,1) be the rounding errors. 
A simple computation # shows that k2 = c1·b1 + c2·b2 would be zero with exact division, so # |k2| ≤ |δ1·b1| + |δ2·b2| < |b1| + |b2| < 2^128. Similarly |k1| < 2^128. # -# WHAT THE IMPLEMENTATION COMPUTES: +# WHAT THE NAIVE IMPLEMENTATION COMPUTES: # # k2: The implementation stores -b1 (not b1), so it computes -# t1 := c2·b2 - c1·(-b1) = c1·b1 + c2·b2 = k2 (mod r) -# Since |k2| < 2^128 < r, the mod r is a no-op. +# c2·b2 - c1·(-b1) = c1·b1 + c2·b2 = k2 (mod r) # # k1: Using the lattice relation ai ≡ -λ·bi (mod r), a simple computation # shows c1·a1 + c2·a2 ≡ -λ·k2 (mod r), so k1 = k + λ·k2. Hence: -# t2 := t1·λ + k = k1 (mod r) -# Again |k1| < r, so the mod r is a no-op. +# k2·λ + k = k1 (mod r) # -# ALGORITHM (field_declarations.hpp, split_into_endomorphism_scalars): +# SUBTLETY — k2 CAN BE NEGATIVE: # -# 1. c1 ≈ (b2·k)/r via c1 = (endo_g2 · k) >> 256 -# 2. c2 ≈ ((-b1)·k)/r via c2 = (endo_g1 · k) >> 256 -# 3. q1 = c1 · (-b1) mod r -# 4. q2 = c2 · b2 mod r -# 5. t1 = (q2 - q1) mod r = k2 -# 6. t2 = (t1 · λ + k) mod r = k1 -# 7. Return low 128 bits of (t2, t1) as (k1, k2) +# The bound |k2| < 2^128 guarantees the MAGNITUDE fits in 128 bits, but k2 +# can be negative. When k2 < 0, the modular reduction gives t1 = k2 + r, +# which is ~254 bits. The 128-bit truncation then extracts garbage. # - -def split_scalar_buggy(k, modulus, beta, endo_g1, endo_g2, endo_minus_b1, endo_b2): - """ - BUGGY version of the scalar splitting algorithm (before fix). - - This is the original implementation that fails for ~2^{-64} of inputs. - Kept here to demonstrate the bug; use split_scalar() for the fixed version. 
- """ - input = k % modulus - - c1 = (endo_g2 * input) >> 256 - c2 = (endo_g1 * input) >> 256 - - q1_lo = (c1 * endo_minus_b1) % modulus - q2_lo = (c2 * endo_b2) % modulus - - t1 = (q2_lo - q1_lo) % modulus - t2 = (t1 * beta + input) % modulus - - k2 = t1 & ((1 << 128) - 1) - k1 = t2 & ((1 << 128) - 1) - - return k1, k2, t1, t2 - - -# ==================================================================================== -# § 5a. THE NEGATIVE k2 BUG AND FIX -# ==================================================================================== -# -# THE BUG: -# -# The §5 analysis claims "Since |k2| < 2^128 < r, the mod r is a no-op." This is true -# for the MAGNITUDE of k2, but k2 can be NEGATIVE. When k2 < 0: -# -# t1 = k2 mod r = k2 + r (254 bits!) -# -# The 128-bit truncation {t1.data[0], t1.data[1]} then extracts garbage, and both -# k1 and k2 are wrong. -# -# WHEN DOES k2 GO NEGATIVE? -# -# Recall k2 = c1·b1 + c2·b2 = -c1·|b1| + c2·b2, where c1 and c2 are floors of -# rational values. Writing c1 = c1_exact - δ1, c2 = c2_exact - δ2 with δ ∈ [0,1): +# Recall k2 = -c1·|b1| + c2·b2, where c1 and c2 are floors of rational +# values. Writing c1 = c1_exact - δ1, c2 = c2_exact - δ2 with δ ∈ [0,1): # # k2 = -δ1·|b1| + δ2·b2 # -# This is negative when δ1·|b1| > δ2·b2. Since |b1|/b2 ≈ 2^63, this requires +# This is negative when δ1·|b1| > δ2·b2. Since |b1|/b2 ≈ 2^63, this needs # δ1 > δ2·2^{-63} — i.e., δ1 can be tiny but must be nonzero. # -# In practice, this happens at boundaries where c1 "ticks up" to a new integer m. -# The boundary is at k ≈ m·r/b2, where c1 jumps from m-1 to m. Just above the -# boundary, c1 = m while c2·b2 hasn't grown enough to compensate, so -# k2 = c2·b2 - m·|b1| < 0 (with magnitude up to ~b2 ≈ 2^64). +# This happens at boundaries where c1 "ticks up" to a new integer m: at +# k ≈ ceil(m · 2^256 / endo_g2), c1 jumps to m while c2·b2 hasn't grown +# enough to compensate, so k2 = c2·b2 - m·|b1| < 0. # -# HOW MANY INPUTS ARE AFFECTED? 
+# Frequency: for each of the ~b2 ≈ 2^64 values of c1, there is a contiguous +# range of ~2^{126} affected k values. Total: ~2^{190} / 2^{254} ≈ 2^{-64} +# fraction. Far too rare for random testing, but easily constructed. # -# For each of the ~b2 ≈ 2^64 values of c1, there is a contiguous range of ~2^{126} -# values of k where k2 < 0. Total: ~2^{190} out of r ≈ 2^{254}, a fraction of ~2^{-64}. -# This is far too rare for random testing to catch (~2^{64} samples needed), but the -# affected inputs are deterministic and easily constructed. +# FIX: When t1 has bits above position 128 (in C++: t1.data[2] or t1.data[3] +# nonzero), we add |b1| to t1. This is equivalent to decrementing c1 by 1, +# shifting the decomposition by the lattice vector (a1, b1). In other words +# we change our "close lattice vector": # -# THE FIX: +# k2_new = k2 + |b1| (now positive, ~127 bits) +# k1_new = k1 - a1 (shifted down by ~64 bits) # -# When k2 is negative, t1 = k2 + r has nonzero upper limbs (bits 128..253). We detect -# this and add |b1| to k2, which is equivalent to decrementing c1 by 1. This shifts -# the decomposition by the lattice vector (a1, b1): +# ALGORITHM (field_declarations.hpp, split_into_endomorphism_scalars): # -# k2_new = k2 + |b1| (now positive, ~127 bits) -# k1_new = k1 - a1 (shifted by ~64 bits, still fits in 128 bits) +# 1. c1 ≈ (b2·k)/r via c1 = (endo_g2 · k) >> 256 +# 2. c2 ≈ ((-b1)·k)/r via c2 = (endo_g1 · k) >> 256 +# 3. q1 = c1 · (-b1) (low 256 bits of 512-bit product) +# 4. q2 = c2 · b2 (low 256 bits of 512-bit product) +# 5. t1 = (q2 - q1).reduce_once() = k2 (mod r) +# 6. if t1 > 128 bits: t1 = (t1 + endo_minus_b1).reduce_once() [negative k2 fix] +# 7. t2 = (t1 · λ + k).reduce_once() = k1 (mod r) +# 8. Return low 128 bits of (t2, t1) as (k1, k2) # -# Algebraic correctness is preserved because (a1, b1) ∈ L, so the shift satisfies -# a1 + λ·b1 ≡ 0 (mod r), meaning k1_new - λ·k2_new ≡ k1 - λ·k2 ≡ k (mod r). 
def split_scalar(k, modulus, beta, endo_g1, endo_g2, endo_minus_b1, endo_b2): """ Split scalar k into (k1, k2) such that k ≡ k1 - λ·k2 (mod r). - Implements split_into_endomorphism_scalars() in field_declarations.hpp - (with the negative-k2 fix applied). + Implements split_into_endomorphism_scalars() in field_declarations.hpp. Returns: (k1, k2, t1, t2): The 128-bit split scalars and their full-width forms @@ -284,10 +243,10 @@ def split_scalar(k, modulus, beta, endo_g1, endo_g2, endo_minus_b1, endo_b2): t1 = (q2_lo - q1_lo) % modulus - # FIX: k2 (= t1) can be slightly negative for ~2^{-64} of inputs. + # Negative-k2 fix: k2 (= t1) can be slightly negative for ~2^{-64} of inputs. # When negative, t1 = k2 + r is 254 bits (upper limbs nonzero in C++). - # Detect this and add |b1| to shift along the lattice vector (a1, b1), - # making k2 positive while keeping both scalars within 128 bits. + # Adding |b1| shifts along the lattice vector (a1, b1), making k2 positive. + # In C++: if (t1.data[2] != 0 || t1.data[3] != 0) if t1.bit_length() > 128: t1 = (t1 + endo_minus_b1) % modulus @@ -318,19 +277,29 @@ def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): verify_split(k_test, k1, k2, t1, t2, lambda_val, r) -# § 6a. Verify the fix on concrete negative-k2 trigger inputs. +# § 6a. Verify the negative-k2 fix on concrete trigger inputs. # # These are k = ceil(m * 2^256 / endo_g2) for m = 1, 2, 3 — the smallest k values -# where c1 ticks up to m. The buggy version fails; the fixed version must pass. +# where c1 ticks up to m. Without the fix, t1 would be > 128 bits (negative k2 +# wraps around mod r to ~254 bits). The fix brings t1 back within 128 bits. 
for m in [1, 2, 3]: k_trigger = (m * (1 << 256) + endo_g2 - 1) // endo_g2 assert k_trigger < r, f"trigger input must be < r for m={m}" - # Buggy version fails (t1 > 128 bits due to negative k2) - _, _, t1_buggy, _ = split_scalar_buggy(k_trigger, r, lambda_val, endo_g1, endo_g2, endo_minus_b1, endo_b2) - assert t1_buggy.bit_length() > 128, f"Expected buggy t1 > 128 bits for m={m}, got {t1_buggy.bit_length()}" - - # Fixed version passes + # Show that the raw (pre-fix) t1 would be > 128 bits for these inputs: + # compute t1_raw without the fix to demonstrate the problem + inp = k_trigger % r + c1_raw = (endo_g2 * inp) >> 256 + c2_raw = (endo_g1 * inp) >> 256 + q1_raw = (c1_raw * endo_minus_b1) % r + q2_raw = (c2_raw * endo_b2) % r + t1_raw = (q2_raw - q1_raw) % r + assert t1_raw.bit_length() > 128, ( + f"Expected raw t1 > 128 bits for m={m}, got {t1_raw.bit_length()} — " + f"this input should trigger the negative-k2 case" + ) + + # The actual algorithm (with fix) must produce valid 128-bit scalars k1, k2, t1, t2 = split_scalar(k_trigger, r, lambda_val, endo_g1, endo_g2, endo_minus_b1, endo_b2) verify_split(k_trigger, k1, k2, t1, t2, lambda_val, r) @@ -346,6 +315,8 @@ def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): # 3. The decomposition uses Babai's nearest plane algorithm on a short lattice basis # 4. Precomputed constants enable division-free computation via fixed-point arithmetic # 5. The algorithm guarantees both k1 and k2 fit in 128 bits (actually ≤ 127 bits) +# 6. 
For ~2^{-64} of inputs, k2 is slightly negative; the fix detects this (upper +# limbs nonzero) and shifts along a lattice vector to make k2 positive # # Performance: Instead of one 254-bit scalar multiplication k·P, we compute # k1·P - k2·φ(P) with two ~127-bit scalars processed via interleaved WNAF, diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp b/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp index e396578a9053..66872b24ddd7 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp @@ -496,6 +496,16 @@ template struct alignas(32) field { field q2_lo{ q2.data[0], q2.data[1], q2.data[2], q2.data[3] }; field t1 = (q2_lo - q1_lo).reduce_once(); + + // k2 (= t1) can be slightly negative for ~2^{-64} of inputs. + // When negative, t1 = k2 + r is 254 bits (upper limbs nonzero). + // Fix: decrement c1 by 1, equivalent to adding |b1| to k2. + // This shifts k2 by +|b1| (~127 bits, now positive) and k1 by -a1 (~64 bits), + // keeping both within 128 bits. See endomorphism_scalars.py for more details. 
+ if (t1.data[2] != 0 || t1.data[3] != 0) { + t1 = (t1 + endo_minus_b1).reduce_once(); + } + field beta = cube_root_of_unity(); field t2 = (t1 * beta + input).reduce_once(); return { From 188e09295f13813a26fba8afd75a7f49f8f4973a Mon Sep 17 00:00:00 2001 From: notnotraju Date: Fri, 20 Feb 2026 10:25:55 +0000 Subject: [PATCH 3/8] added biggroup_goblin regression test --- .../biggroup/biggroup_goblin.test.cpp | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_goblin.test.cpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_goblin.test.cpp index 30e86b9cbe6a..0814ef69b5a6 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_goblin.test.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_goblin.test.cpp @@ -154,6 +154,60 @@ template class stdlib_biggroup_goblin : public testing::Test { EXPECT_CIRCUIT_CORRECTNESS(builder); } + /** + * @brief Regression test: negative-k2 edge-case scalar through the stdlib biggroup path. + * @details The naive GLV endomorphism splitting can produce a negative k2 for ~2^{-64} of inputs. + * Before the fix in split_into_endomorphism_scalars, this caused the op queue to store garbage + * z1/z2 values (254-bit instead of 128-bit). The stdlib biggroup `batch_mul` adds the constraint + * `scalar.assert_equal(z_1 - z_2 * beta)`, which catches the mismatch at the Mega circuit level. + * + * See ecc/fields/endomorphism_scalars.py for an analysis. + */ + static void test_endomorphism_negative_k2_regression() + { + // clang-format off + // Boundary scalars k = ceil(m * 2^256 / endo_g2) from endomorphism_scalars.py. + // These are the smallest scalars where c1 ticks up, making k2 negative. 
+ const std::array, 3> boundary_cases = {{ + {{ 0x01624731e1195570, 0x3ba491482db4da14, 0x59e26bcea0d48bac, 0x0 }}, // m=1 + {{ 0x02c48e63c232aadf, 0x774922905b69b428, 0xb3c4d79d41a91758, 0x0 }}, // m=2 + {{ 0x0426d595a34c004e, 0xb2edb3d8891e8e3c, 0x0da7436be27da304, 0x1 }}, // m=3 + }}; + // clang-format on + + for (const auto& limbs : boundary_cases) { + fr base_scalar(uint256_t{ limbs[0], limbs[1], limbs[2], limbs[3] }); + + // The negative-k2 band extends ~2^{123}-2^{126} above each boundary scalar. + // A random 122-bit positive perturbation lands inside this band, where the + // original (unfixed) k2 is still negative. We therefore test two scalars: the original boundary case and a + // 122-bit perturbation. + uint256_t rand_bits(fr::random_element()); + uint256_t offset = rand_bits & ((uint256_t(1) << 122) - 1); + std::array scalars = { base_scalar, base_scalar + fr(offset) }; + + // Test via batch_mul + for (const auto& scalar : scalars) { + Builder builder; + element_ct pt = element_ct::from_witness(&builder, affine_element::one()); + scalar_ct sc = scalar_ct::from_witness(&builder, scalar); + element_ct result = element_ct::batch_mul({ pt }, { sc }); + (void)result; + EXPECT_CIRCUIT_CORRECTNESS(builder); + } + + // Test via operator* (delegates to batch_mul) + for (const auto& scalar : scalars) { + Builder builder; + element_ct pt = element_ct::from_witness(&builder, affine_element::one()); + scalar_ct sc = scalar_ct::from_witness(&builder, scalar); + element_ct result = pt * sc; + (void)result; + EXPECT_CIRCUIT_CORRECTNESS(builder); + } + } + } + /** * @brief Check goblin-style negate works as intended, including with points at infinity */ @@ -199,3 +253,8 @@ TYPED_TEST(stdlib_biggroup_goblin, neg) { TestFixture::test_goblin_style_neg(); } + +TYPED_TEST(stdlib_biggroup_goblin, endomorphism_negative_k2_regression) +{ + TestFixture::test_endomorphism_negative_k2_regression(); +} From 2042e8432349c333615a3520960cff1bae7aaf9b Mon Sep 17 00:00:00 2001 From: 
notnotraju Date: Mon, 23 Feb 2026 13:05:25 +0000 Subject: [PATCH 4/8] refactored the literate python program, added information about fq and secpk1 (to at least generate parameters) --- .../ecc/fields/endomorphism_scalars.py | 647 ++++++++++++++---- 1 file changed, 510 insertions(+), 137 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py b/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py index 2831e3085eb0..5f0152d2f5e3 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py @@ -1,38 +1,137 @@ #!/usr/bin/env python3 """ -BN254 GLV Endomorphism: Constants and Scalar Splitting Explained +GLV Endomorphism: Constants and Scalar Splitting for Multiple Curves -This document explains the "splitting scalars" algorithm in Barretenberg. The first section -works explicitly with Fr, the scalar field of the BN254 elliptic curve. +This document explains the "splitting scalars" algorithm in Barretenberg for all curves +that admit an efficient endomorphism. 
We cover: + + Part 0 (§0): Preliminaries — the GLV lattice and how to find a short basis + Part I (§1–§5): BN254 Fr — the scalar field of BN254 (254-bit, uses 2^256 shift) + Part II (§6–§9): BN254 Fq — the base field of BN254 (254-bit, uses 2^256 shift) + Part III (§10–§14): secp256k1 Fr — the scalar field of secp256k1 (256-bit, uses 2^384 shift) Reference: Gallant, Lambert, Vanstone, "Faster Point Multiplication on Elliptic Curves" (2001) NOTATION: - ≈ "approximately equal" - values differ by a small rounding error + p a prime modulus (generic; each Part instantiates it) + λ a non-trivial cube root of unity in F_p (λ³ = 1, λ ≠ 1) + ≈ "approximately equal" — values differ by a small rounding error """ + +# ╔══════════════════════════════════════════════════════════════════════════════╗ +# ║ PART 0: Preliminaries — The GLV Lattice ║ +# ╚══════════════════════════════════════════════════════════════════════════════╝ + # ==================================================================================== -# § 1. INTRODUCTION +# § 0. THE GLV LATTICE AND SHORT BASIS # ==================================================================================== # -# The BN254 elliptic curve admits an efficiently computable endomorphism φ that -# satisfies φ(P) = λ·P where λ is a cube root of unity in the scalar field Fr. +# Let p be a prime and λ ∈ F_p a non-trivial cube root of unity (so λ² + λ + 1 ≡ 0). +# The GLV lattice is: +# +# L = { (a, b) ∈ Z² : a + λ·b ≡ 0 (mod p) } +# +# Equivalently, +# +# L is a full-rank sublattice of Z² with determinant ±p (since the map +# Z² → Z/pZ, (a,b) ↦ a + λb is surjective, its kernel has index p). # -# This enables an optimization: any scalar multiplication k·P can be -# decomposed as: +# SCALAR SPLITTING. Given a scalar k, find a lattice point (x, y) close to (k, 0). +# Then k1 = k − x and k2 = y satisfy k ≡ k1 − λ·k2 (mod p), with both k1, k2 small. 
+# Writing (x, y) = c1·(a1,b1) + c2·(a2,b2) and inverting the 2×2 basis matrix against +# (k, 0) with det = p gives: # -# k·P = k1·P - k2·φ(P) +# c1 = ⌊k·b2 / p⌋, c2 = ⌊k·(−b1) / p⌋ # -# where k1, k2 are approximately 127 bits each (half the size of the original -# 254-bit scalar k). Since φ(P) is nearly free to compute (one Fq multiplication) -# and the scalars are half-sized, this significantly accelerates scalar multiplication. +# Note: only b1 and b2 appear; a1, a2 are not needed for the splitting, which is why +# Barretenberg stores only b1 and b2 in the .hpp parameter files. Any lattice element +# satisfies a ≡ -λ·b (mod p). This relation is verified for every basis we compute, +# and allows us to recover k1 from k2 without needing a1 or a2: since +# c1·a1 + c2·a2 ≡ -λ·(c1·b1 + c2·b2) ≡ -λ·k2, we get k1 = k + λ·k2. # -# This document explains: -# 1. How the mathematical constants are derived -# 2. How the algorithm guarantees k ≡ k1 - λ·k2 (mod r), where k1 and k2 are short (~127-bit) values. +# FINDING A SHORT BASIS. We run the Euclidean algorithm on (λ, p). The successive +# remainders r_i satisfy |r_i| ≈ p / (product of quotients), so they shrink from p +# down through √p and below. We stop at the first remainder r_j < √p and read off +# two short lattice vectors from the Bézout coefficients at steps j−1 and j. +# +# The resulting vector sizes depend on the specific λ and p: +# +# • BN254 (Fr and Fq): the curve is constructed from a 63-bit parameter x, and the +# lattice vectors are a1 = b2 = 2x+1 (64 bits), |b1| = 6x²+2x (127 bits). +# This asymmetric 64/127-bit pattern is a consequence of the BN parametrisation. +# +# • secp256k1 Fr: no small generating parameter; the lattice vectors are all in the +# generic ~126–129-bit range (roughly √p for a 256-bit prime). +# + +from math import isqrt + +def find_short_lattice_basis(lambda_val, modulus): + """ + Find a short basis for the lattice L = {(a,b) : a + λ·b ≡ 0 (mod p)}. 
+ + Returns (a1, b1, a2, b2) such that: + - (a1, b1) and (a2, b2) are a basis for L + - a1 + λ·b1 ≡ 0 (mod p), a2 + λ·b2 ≡ 0 (mod p) + - det = a1·b2 - a2·b1 = ±p + """ + # √p is the target vector length. The lattice L has determinant p, so by + # Minkowski's theorem its shortest vector has length ≤ √p. We want both + # basis vectors near that length so that the GLV subscalars k1, k2 are + # each ~ √p ≈ 2^{n/2} + approx_sqrt = isqrt(modulus) + + # Extended Euclidean algorithm on (λ, p). + # + # Bézout invariant (maintained at every step): + # remainder ≡ coeff · λ (mod p) + # + # This holds initially (remainder = λ, coeff = 1) and is preserved by + # the update new_remainder = prev_remainder - quot·remainder, new_coeff = + # prev_coeff - quot·coeff. It follows that (-remainder, coeff) is always + # a lattice vector: -remainder + λ·coeff ≡ -coeff·λ + coeff·λ ≡ 0. + remainder, prev_remainder = lambda_val, modulus + coeff, prev_coeff = 1, 0 + + # Run until the remainder first drops below √p. + while remainder >= approx_sqrt: + quot = prev_remainder // remainder + prev_remainder, remainder = remainder, prev_remainder - quot * remainder + prev_coeff, coeff = coeff, prev_coeff - quot * coeff + + # At this point: + # vec_before = (-prev_remainder, prev_coeff) — last step above √p + # vec_cross = (-remainder, coeff) — first step below √p + vec_before = (-prev_remainder, prev_coeff) + vec_cross = (-remainder, coeff) + + # One more EEA step gives an independent candidate vector. + quot = prev_remainder // remainder + r_after = prev_remainder - quot * remainder + s_after = prev_coeff - quot * coeff + vec_after = (-r_after, s_after) + + # First basis vector: vec_cross (shortest, by construction). + # Second basis vector: shorter of vec_before and vec_after. + a1, b1 = vec_cross + a2, b2 = vec_after if (r_after**2 + s_after**2 < prev_remainder**2 + prev_coeff**2) else vec_before + + # Normalise signs so that a1, a2 are positive. 
+ if a1 < 0 or a1.bit_length() >= 128: + a1, b1 = -a1, -b1 + if a2 < 0 or a2.bit_length() >= 128: + a2, b2 = -a2, -b2 + + return a1, b1, a2, b2 + + +# ╔══════════════════════════════════════════════════════════════════════════════╗ +# ║ PART I: BN254 Fr (Scalar Field) ║ +# ╚══════════════════════════════════════════════════════════════════════════════╝ # ==================================================================================== -# § 2. FIELD PARAMETERS (for Fr) +# § 1. BN254 Fr — FIELD PARAMETERS # ==================================================================================== # The scalar field modulus of BN254 (from bn254/fr.hpp) @@ -61,18 +160,10 @@ # ==================================================================================== -# § 3. THE LATTICE BASIS +# § 2. BN254 Fr — LATTICE BASIS # ==================================================================================== # -# To decompose k into short scalars k1, k2, we use a "nearest lattice vector" style of algorithm. -# First, consider the 2D lattice: -# -# L = {(a, b) ∈ Z² : a + λ·b ≡ 0 (mod r)} = ker(Z² -> Z/rZ), (a, b) ↦ a + λ·b -# -# NOTE: any lattice point (a, b) ∈ L satisfies a ≡ -λ·b (mod r). -# -# The phrase "short basis for a lattice" means two vectors with small components. Here is -# our short basis. +# Short basis for the GLV lattice L (see §0). a1 = 0x89d3256894d213e3 # 64 bits b1 = -0x6f4d8248eeb859fc8211bbeb7d4f1128 # 127 bits (negative) @@ -89,31 +180,19 @@ det = (a1 * b2 - a2 * b1) assert abs(det) == r, "Lattice determinant ±r; hence for our vectors to be a lattice basis, they must have the same determinant (up to sign)" -# These constants will later be shown to match those in barretenberg. It is worth noting -# that in fr.hpp, we DO NOT store a1 or a2; it turns out we ONLY need b1 and b2. +# Note: fr.hpp does NOT store a1 or a2 — only b1 and b2 are needed (see §0). # ==================================================================================== -# § 4. 
PRECOMPUTED CONSTANTS FOR DIVISION-FREE COMPUTATION +# § 3. BN254 Fr — PRECOMPUTED CONSTANTS (256-bit shift) # ==================================================================================== # -# The scalar splitting algorithm (Babai's nearest plane) requires computing: -# -# c1 ≈ (k·b2) / r -# c2 ≈ (k·(-b1)) / r -# -# Division by r is expensive. Instead, we _precompute_ fixed-point representations: -# -# endo_g1 = ((-b1) · 2^256) // r -# endo_g2 = (b2 · 2^256) // r +# Fixed-point approximations for division-free Babai rounding (see §0): # -# Note that the above expressions, endo_g1 and endo_g2 DO NOT depend on k. +# endo_g1 = ⌊(-b1) · 2^256 / r⌋ +# endo_g2 = ⌊b2 · 2^256 / r⌋ # -# Then: (endo_g2 · k) >> 256 ≈ (b2 · k) / r = c1 -# (endo_g1 · k) >> 256 ≈ ((-b1) · k) / r = c2 -# -# The ≈ hides two rounding errors: the floor in computing endo_g2 (or endo_g1), -# and the floor in the >> 256 shift. However, the total error is at most 1. -# So c1, c2 are each off by at most 1 from the exact rational values. +# Then c1 = (endo_g2 · k) >> 256 ≈ k·b2/r and c2 = (endo_g1 · k) >> 256 ≈ k·(-b1)/r, +# each off by at most 1 from the exact rational value. def compute_splitting_constants(modulus, b1, b2): @@ -145,79 +224,30 @@ def compute_splitting_constants(modulus, b1, b2): # ==================================================================================== -# § 5. THE SCALAR SPLITTING ALGORITHM +# § 4. BN254 Fr — THE 256-BIT SPLITTING ALGORITHM # ==================================================================================== # -# Given a scalar k, we compute (k1, k2) such that: -# k ≡ k1 - λ·k2 (mod r), |k1|, |k2| < 2^128 -# -# DERIVATION (Babai's nearest plane on L): -# -# Find a lattice point (x, y) ∈ L close to (k, 0). Setting k1 = k - x, k2 = y, -# the lattice condition x + λ·y ≡ 0 (mod r) gives k1 - λ·k2 ≡ k (mod r). 
-# -# Writing (x, y) = c1·(a1, b1) + c2·(a2, b2), inverting the basis matrix -# against the target (k, 0) with det = r gives: -# -# c1 = ⌊k·b2 / r⌋, c2 = ⌊k·(-b1) / r⌋ -# -# Note: neither a1 nor a2 appear, which is why we don't store them. -# Note: b1 is negative, hence the second expression is again non-negative. -# Note: we can think of (k1, k2) as the "error term" to a lattice approximation of (k, 0). -# -# APPROXIMATE BOUNDS: Let δ1, δ2 ∈ [0,1) be the rounding errors. A simple computation -# shows that k2 = c1·b1 + c2·b2 would be zero with exact division, so -# |k2| ≤ |δ1·b1| + |δ2·b2| < |b1| + |b2| < 2^128. Similarly |k1| < 2^128. -# -# WHAT THE NAIVE IMPLEMENTATION COMPUTES: -# -# k2: The implementation stores -b1 (not b1), so it computes -# c2·b2 - c1·(-b1) = c1·b1 + c2·b2 = k2 (mod r) -# -# k1: Using the lattice relation ai ≡ -λ·bi (mod r), a simple computation -# shows c1·a1 + c2·a2 ≡ -λ·k2 (mod r), so k1 = k + λ·k2. Hence: -# k2·λ + k = k1 (mod r) +# Computes (k1, k2) with k ≡ k1 - λ·k2 (mod r) and |k1|, |k2| < 2^128. +# See §0 for the derivation (Babai's nearest plane). # # SUBTLETY — k2 CAN BE NEGATIVE: # -# The bound |k2| < 2^128 guarantees the MAGNITUDE fits in 128 bits, but k2 -# can be negative. When k2 < 0, the modular reduction gives t1 = k2 + r, -# which is ~254 bits. The 128-bit truncation then extracts garbage. +# k2 = -δ1·|b1| + δ2·b2 where δ1, δ2 ∈ [0,1) are rounding errors. This is +# negative when δ1·|b1| > δ2·b2. Since |b1|/b2 ≈ 2^63 for BN254, even tiny +# δ1 can cause this. It happens at k ≈ ⌈m·2^256/endo_g2⌉ where c1 ticks up +# to m. Frequency: ~2^{-64} of all inputs. # -# Recall k2 = -c1·|b1| + c2·b2, where c1 and c2 are floors of rational -# values. Writing c1 = c1_exact - δ1, c2 = c2_exact - δ2 with δ ∈ [0,1): +# FIX: When t1 > 128 bits (i.e. k2 < 0 wrapped mod r), add |b1|. 
This shifts +# along the lattice vector (a1, b1), making k2 positive: +# k2_new = k2 + |b1| (positive, ~127 bits), k1_new = k1 - a1 # -# k2 = -δ1·|b1| + δ2·b2 +# ALGORITHM (split_into_endomorphism_scalars in field_declarations.hpp): # -# This is negative when δ1·|b1| > δ2·b2. Since |b1|/b2 ≈ 2^63, this needs -# δ1 > δ2·2^{-63} — i.e., δ1 can be tiny but must be nonzero. -# -# This happens at boundaries where c1 "ticks up" to a new integer m: at -# k ≈ ceil(m · 2^256 / endo_g2), c1 jumps to m while c2·b2 hasn't grown -# enough to compensate, so k2 = c2·b2 - m·|b1| < 0. -# -# Frequency: for each of the ~b2 ≈ 2^64 values of c1, there is a contiguous -# range of ~2^{126} affected k values. Total: ~2^{190} / 2^{254} ≈ 2^{-64} -# fraction. Far too rare for random testing, but easily constructed. -# -# FIX: When t1 has bits above position 128 (in C++: t1.data[2] or t1.data[3] -# nonzero), we add |b1| to t1. This is equivalent to decrementing c1 by 1, -# shifting the decomposition by the lattice vector (a1, b1). In other words -# we change our "close lattice vector": -# -# k2_new = k2 + |b1| (now positive, ~127 bits) -# k1_new = k1 - a1 (shifted down by ~64 bits) -# -# ALGORITHM (field_declarations.hpp, split_into_endomorphism_scalars): -# -# 1. c1 ≈ (b2·k)/r via c1 = (endo_g2 · k) >> 256 -# 2. c2 ≈ ((-b1)·k)/r via c2 = (endo_g1 · k) >> 256 -# 3. q1 = c1 · (-b1) (low 256 bits of 512-bit product) -# 4. q2 = c2 · b2 (low 256 bits of 512-bit product) -# 5. t1 = (q2 - q1).reduce_once() = k2 (mod r) -# 6. if t1 > 128 bits: t1 = (t1 + endo_minus_b1).reduce_once() [negative k2 fix] -# 7. t2 = (t1 · λ + k).reduce_once() = k1 (mod r) -# 8. Return low 128 bits of (t2, t1) as (k1, k2) +# 1. c1 = (endo_g2 · k) >> 256, c2 = (endo_g1 · k) >> 256 +# 2. t1 = (c2·b2 - c1·(-b1)) mod r [= k2] +# 3. if t1 > 128 bits: t1 += endo_minus_b1 [negative-k2 fix] +# 4. t2 = (t1·λ + k) mod r [= k1] +# 5. 
Return low 128 bits of (t2, t1) # def split_scalar(k, modulus, beta, endo_g1, endo_g2, endo_minus_b1, endo_b2): @@ -260,10 +290,8 @@ def split_scalar(k, modulus, beta, endo_g1, endo_g2, endo_minus_b1, endo_b2): # ==================================================================================== -# § 6. VERIFICATION +# § 5. BN254 Fr — VERIFICATION # ==================================================================================== -# -# We verify the algorithm on several test cases, including edge cases. def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): """Verify correctness and bounds of the scalar split.""" @@ -277,7 +305,7 @@ def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): verify_split(k_test, k1, k2, t1, t2, lambda_val, r) -# § 6a. Verify the negative-k2 fix on concrete trigger inputs. +# § 5a. Verify the negative-k2 fix on concrete trigger inputs. # # These are k = ceil(m * 2^256 / endo_g2) for m = 1, 2, 3 — the smallest k values # where c1 ticks up to m. Without the fix, t1 would be > 128 bits (negative k2 @@ -304,34 +332,379 @@ def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): verify_split(k_trigger, k1, k2, t1, t2, lambda_val, r) +# ╔══════════════════════════════════════════════════════════════════════════════╗ +# ║ PART II: BN254 Fq (Base Field) ║ +# ╚══════════════════════════════════════════════════════════════════════════════╝ + +# ==================================================================================== +# § 6. BN254 Fq — FIELD PARAMETERS AND CUBE ROOT +# ==================================================================================== +# +# Fq also has a cube root of unity, so the same GLV technique applies. +# Since Fq is also 254 bits (top limb < 2^62), the 256-bit shift algorithm is used. 
+ +# The base field modulus of BN254 (from bn254/fq.hpp) +fq_modulus = 0x30644E72E131A029B85045B68181585D97816A916871CA8D3C208C16D87CFD47 + +# Montgomery parameter for Fq: R = 2^256 mod q +fq_R = pow(2, 256, fq_modulus) +fq_R_inv = pow(fq_R, -1, fq_modulus) + +# The cube root of unity β ∈ Fq (from bn254/fq.hpp), stored in Montgomery form +fq_cube_root_montgomery = ( + 0x71930c11d782e155 | + (0xa6bb947cffbe3323 << 64) | + (0xaa303344d4741444 << 128) | + (0x2c3b3f0d26594943 << 192) +) + +# Convert from Montgomery form to standard form +fq_beta = (fq_cube_root_montgomery * fq_R_inv) % fq_modulus + +# Verify that β is a non-trivial cube root of unity in Fq +assert (pow(fq_beta, 2, fq_modulus) + fq_beta + 1) % fq_modulus == 0, "β² + β + 1 ≡ 0 (mod q)" +assert fq_beta != 1, "β must be non-trivial" + + # ==================================================================================== -# § 7. SUMMARY +# § 7. BN254 Fq — LATTICE BASIS # ==================================================================================== # -# The GLV endomorphism optimization for BN254: +# Derive the short lattice basis for Fq using find_short_lattice_basis (§0). + +fq_a1, fq_b1, fq_a2, fq_b2 = find_short_lattice_basis(fq_beta, fq_modulus) + +# Verify lattice membership +assert (fq_a1 + fq_beta * fq_b1) % fq_modulus == 0, "Fq lattice vector 1" +assert (fq_a2 + fq_beta * fq_b2) % fq_modulus == 0, "Fq lattice vector 2" + +# Verify determinant +fq_det = fq_a1 * fq_b2 - fq_a2 * fq_b1 +assert abs(fq_det) == fq_modulus, f"Fq lattice determinant must be ±q, got {fq_det}" + + +# ==================================================================================== +# § 8. 
BN254 Fq — PRECOMPUTED CONSTANTS AND VERIFICATION +# ==================================================================================== + +fq_endo_g1, fq_endo_g2, fq_endo_minus_b1, fq_endo_b2 = compute_splitting_constants( + fq_modulus, fq_b1, fq_b2 +) + +# Verify these match the values in bn254/fq.hpp +fq_expected_endo_g1 = 0x7a7bd9d4391eb18d | (0x4ccef014a773d2cf << 64) | (0x2 << 128) +fq_expected_endo_g2 = 0xd91d232ec7e0b3d2 | (0x2 << 64) +fq_expected_endo_minus_b1 = 0x8211bbeb7d4f1129 | (0x6f4d8248eeb859fc << 64) +fq_expected_endo_b2 = 0x89d3256894d213e2 + +assert fq_endo_g1 == fq_expected_endo_g1, ( + f"Fq endo_g1 mismatch: got {hex(fq_endo_g1)}, expected {hex(fq_expected_endo_g1)}" +) +assert fq_endo_g2 == fq_expected_endo_g2, ( + f"Fq endo_g2 mismatch: got {hex(fq_endo_g2)}, expected {hex(fq_expected_endo_g2)}" +) +assert fq_endo_minus_b1 == fq_expected_endo_minus_b1, ( + f"Fq endo_minus_b1 mismatch: got {hex(fq_endo_minus_b1)}, expected {hex(fq_expected_endo_minus_b1)}" +) +assert fq_endo_b2 == fq_expected_endo_b2, ( + f"Fq endo_b2 mismatch: got {hex(fq_endo_b2)}, expected {hex(fq_expected_endo_b2)}" +) + + +# ==================================================================================== +# § 9. 
BN254 Fq — SPLITTING VERIFICATION +# ==================================================================================== + +for k_test in [0, 1, 42, fq_beta, fq_modulus - 1]: + k1, k2, t1, t2 = split_scalar( + k_test, fq_modulus, fq_beta, fq_endo_g1, fq_endo_g2, fq_endo_minus_b1, fq_endo_b2 + ) + verify_split(k_test, k1, k2, t1, t2, fq_beta, fq_modulus) + +# Verify negative-k2 triggers for Fq +for m in [1, 2, 3]: + k_trigger = (m * (1 << 256) + fq_endo_g2 - 1) // fq_endo_g2 + if k_trigger < fq_modulus: + k1, k2, t1, t2 = split_scalar( + k_trigger, fq_modulus, fq_beta, fq_endo_g1, fq_endo_g2, fq_endo_minus_b1, fq_endo_b2 + ) + verify_split(k_trigger, k1, k2, t1, t2, fq_beta, fq_modulus) + + +# ╔══════════════════════════════════════════════════════════════════════════════╗ +# ║ PART III: secp256k1 Fr (Scalar Field) ║ +# ╚══════════════════════════════════════════════════════════════════════════════╝ + +# ==================================================================================== +# § 10. secp256k1 Fr — FIELD PARAMETERS +# ==================================================================================== # -# 1. The BN254 curve has an endomorphism φ(x,y) = (β·x, y) satisfying φ(P) = λ·P -# 2. We can decompose any scalar k as k ≡ k1 - λ·k2 (mod r) with k1, k2 ≈ 127 bits -# 3. The decomposition uses Babai's nearest plane algorithm on a short lattice basis -# 4. Precomputed constants enable division-free computation via fixed-point arithmetic -# 5. The algorithm guarantees both k1 and k2 fit in 128 bits (actually ≤ 127 bits) -# 6. For ~2^{-64} of inputs, k2 is slightly negative; the fix detects this (upper -# limbs nonzero) and shifts along a lattice vector to make k2 positive +# secp256k1's scalar field modulus is a full 256 bits (top limb = 0xFFFF...), +# exceeding MODULUS_TOP_LIMB_LARGE_THRESHOLD (2^62). AUDITTODO: explain *exactly* the rounding issues that force this. 
+# This requires: +# - 2^384 shift instead of 2^256 (a >>256 shift loses precision for 256-bit moduli) +# - 4-limb endo_g constants (lo/mid/hi/hihi) +# - Montgomery field multiplication in split_into_endomorphism_scalars_384 + +# The scalar field modulus of secp256k1 (from secp256k1.hpp, FrParams) +secp_r = ( + 0xBFD25E8CD0364141 | + (0xBAAEDCE6AF48A03B << 64) | + (0xFFFFFFFFFFFFFFFE << 128) | + (0xFFFFFFFFFFFFFFFF << 192) +) + +# Montgomery parameter for secp256k1 Fr: R = 2^256 mod r +secp_R = pow(2, 256, secp_r) +secp_R_inv = pow(secp_R, -1, secp_r) + +# The cube root of unity λ ∈ secp256k1::Fr (from secp256k1.hpp FrParams), in Montgomery form +secp_cube_root_montgomery = ( + 0xf07deb3dc9926c9e | + (0x2c93e7ad83c6944c << 64) | + (0x73a9660652697d91 << 128) | + (0x532840178558d639 << 192) +) + +# Convert from Montgomery form to standard form +secp_lambda = (secp_cube_root_montgomery * secp_R_inv) % secp_r + +# Verify that λ is a non-trivial cube root of unity in secp256k1 Fr +assert (pow(secp_lambda, 2, secp_r) + secp_lambda + 1) % secp_r == 0, "λ² + λ + 1 ≡ 0 (mod r)" + + +# ==================================================================================== +# § 11. secp256k1 Fr — LATTICE BASIS +# ==================================================================================== +# +# See §0 for why these vectors are ~126–129 bits (unlike BN254's 64/127 pattern). + +secp_a1, secp_b1, secp_a2, secp_b2 = find_short_lattice_basis(secp_lambda, secp_r) + +# Verify lattice membership +assert (secp_a1 + secp_lambda * secp_b1) % secp_r == 0, "secp256k1 lattice vector 1" +assert (secp_a2 + secp_lambda * secp_b2) % secp_r == 0, "secp256k1 lattice vector 2" + +# Verify determinant +secp_det = secp_a1 * secp_b2 - secp_a2 * secp_b1 +assert abs(secp_det) == secp_r, "secp256k1 lattice determinant must be ±r" + + +# ==================================================================================== +# § 12. 
secp256k1 Fr — PRECOMPUTED CONSTANTS (384-bit shift) +# ==================================================================================== +# +# In the 384-bit code, the naming is "cross-paired" — g1 is paired with minus_b1, +# and g2 is paired with b2 (the opposite of what you might expect): +# +# endo_g1 = ⌈b2 · 2^384 / r⌉ +# endo_g2 = ⌊(-b1) · 2^384 / r⌋ +# +# Note: secp256k1_endo_notes.hpp uses the opposite naming convention for g1/g2, +# but the STORED values in FrParams follow this cross-paired convention. + +def compute_splitting_constants_384(modulus, b1, b2): + """ + Compute the precomputed constants for the 384-bit shift variant. + + Returns (endo_g1, endo_g2, endo_minus_b1, endo_b2) matching the hpp file. + + Convention: endo_g1 is the b2-based approximation (cross-paired with minus_b1), + endo_g2 is the (-b1)-based approximation (cross-paired with b2). + """ + shift = 1 << 384 + # endo_g1 = ceil(b2 * 2^384 / r) — cross-paired with minus_b1 in the algorithm + endo_g1 = -((-b2 * shift) // modulus) + # endo_g2 = floor((-b1) * 2^384 / r) — cross-paired with b2 in the algorithm + endo_g2 = ((-b1) * shift) // modulus + endo_minus_b1 = (-b1) % modulus + endo_b2 = b2 % modulus + return endo_g1, endo_g2, endo_minus_b1, endo_b2 + + +secp_endo_g1, secp_endo_g2, secp_endo_minus_b1, secp_endo_b2 = compute_splitting_constants_384( + secp_r, secp_b1, secp_b2 +) + +# Verify these match the values in secp256k1.hpp (FrParams) +# endo_g1 is stored as (lo, mid, hi, hihi) — 4 × 64-bit limbs +secp_expected_endo_g1 = ( + 0xE893209A45DBB031 | + (0x3DAA8A1471E8CA7F << 64) | + (0xE86C90E49284EB15 << 128) | + (0x3086D221A7D46BCD << 192) +) +secp_expected_endo_g2 = ( + 0x1571B4AE8AC47F71 | + (0x221208AC9DF506C6 << 64) | + (0x6F547FA90ABFE4C4 << 128) | + (0xE4437ED6010E8828 << 192) +) +secp_expected_endo_minus_b1 = 0x6F547FA90ABFE4C3 | (0xE4437ED6010E8828 << 64) +secp_expected_endo_b2 = 0xe86c90e49284eb15 | (0x3086d221a7d46bcd << 64) + +assert secp_endo_g1 == 
secp_expected_endo_g1, ( + f"secp256k1 endo_g1 mismatch:\n got {hex(secp_endo_g1)}\n expected {hex(secp_expected_endo_g1)}" +) +assert secp_endo_g2 == secp_expected_endo_g2, ( + f"secp256k1 endo_g2 mismatch:\n got {hex(secp_endo_g2)}\n expected {hex(secp_expected_endo_g2)}" +) +assert secp_endo_minus_b1 == secp_expected_endo_minus_b1, ( + f"secp256k1 endo_minus_b1 mismatch:\n got {hex(secp_endo_minus_b1)}\n expected {hex(secp_expected_endo_minus_b1)}" +) +assert secp_endo_b2 == secp_expected_endo_b2, ( + f"secp256k1 endo_b2 mismatch:\n got {hex(secp_endo_b2)}\n expected {hex(secp_expected_endo_b2)}" +) + + +# ==================================================================================== +# § 13. secp256k1 Fr — THE 384-BIT SPLITTING ALGORITHM +# ==================================================================================== +# +# Unlike the 256-bit variant, there is no explicit negative-k2 fix — field +# subtraction handles signs. The c1, c2 values are converted to Montgomery form +# and multiplied via field ops (which reduce mod r automatically). +# +# ALGORITHM (split_into_endomorphism_scalars_384 in field_declarations.hpp): +# +# 1. c1 = (endo_g1 · k) >> 384, c2 = (endo_g2 · k) >> 384 +# 2. r2f = c1·(-b1) - c2·b2 (cross-products, computed as field elements) +# 3. r1f = k - r2f·λ +# 4. k1 = r1f, k2 = -r2f + +def split_scalar_384(k, modulus, lambda_val, endo_g1, endo_g2, endo_minus_b1, endo_b2): + """ + Split scalar k using the 384-bit shift variant. + + Implements split_into_endomorphism_scalars_384() in field_declarations.hpp. 
+ + Returns: + (k1, k2): The split scalars such that k ≡ k1 - λ·k2 (mod r) + """ + input_val = k % modulus + + # c1 ≈ k·b2/r, c2 ≈ k·(-b1)/r + c1 = (endo_g1 * input_val) >> 384 + c2 = (endo_g2 * input_val) >> 384 + + # Cross-products (computed as field elements in C++ via Montgomery) + c1_times_minus_b1 = (c1 * endo_minus_b1) % modulus + c2_times_b2 = (c2 * endo_b2) % modulus + + # r2f = c1·(-b1) - c2·b2 (nearly cancels, leaving small lattice error) + r2f = (c1_times_minus_b1 - c2_times_b2) % modulus + + # r1f = k - r2f·λ + r1f = (input_val - r2f * lambda_val) % modulus + + # k1 = r1f, k2 = -r2f; invariant: k ≡ k1 - λ·k2 (mod r) + k1 = r1f + k2 = (-r2f) % modulus + + return k1, k2 + + +# ==================================================================================== +# § 14. secp256k1 Fr — SPLITTING VERIFICATION +# ==================================================================================== + +def verify_split_384(k, k1, k2, lambda_val, modulus): + """Verify correctness of the 384-bit scalar split.""" + # The invariant is k ≡ k1 - λ·k2 (mod r) + reconstructed = (k1 - lambda_val * k2) % modulus + assert reconstructed == k % modulus, ( + f"k ≡ k1 - λ·k2 failed for k={hex(k)}\n" + f" k1={hex(k1)}, k2={hex(k2)}\n" + f" reconstructed={hex(reconstructed)}, expected={hex(k % modulus)}" + ) + # For the 384-bit variant, k1 and k2 are field elements; they should be small + # enough that the decomposition is useful. We verify they fit in ~129 bits. + # (The C++ code does not explicitly truncate to 128 bits in this path; + # the values may be slightly larger than in the 256-bit path.) 
+ k1_eff = k1 if k1 <= modulus // 2 else modulus - k1 + k2_eff = k2 if k2 <= modulus // 2 else modulus - k2 + assert k1_eff.bit_length() <= 129, ( + f"k1 effective magnitude has {k1_eff.bit_length()} bits (> 129) for k={hex(k)}" + ) + assert k2_eff.bit_length() <= 129, ( + f"k2 effective magnitude has {k2_eff.bit_length()} bits (> 129) for k={hex(k)}" + ) + + +for k_test in [0, 1, 42, secp_lambda, secp_r - 1, secp_r // 2, secp_r // 3]: + k1, k2 = split_scalar_384( + k_test, secp_r, secp_lambda, secp_endo_g1, secp_endo_g2, secp_endo_minus_b1, secp_endo_b2 + ) + verify_split_384(k_test, k1, k2, secp_lambda, secp_r) + + +# Also verify with the cube root of unity in the BASE field (secp256k1 Fq). +# The base field Fq of secp256k1 has modulus p = 2^256 - 2^32 - 977, which also +# has a cube root of unity β. This β is what gets multiplied with the x-coordinate +# in the endomorphism φ(x,y) = (β·x, y). Let's verify it. + +secp_fq_modulus = ( + 0xFFFFFFFEFFFFFC2F | + (0xFFFFFFFFFFFFFFFF << 64) | + (0xFFFFFFFFFFFFFFFF << 128) | + (0xFFFFFFFFFFFFFFFF << 192) +) + +secp_fq_R = pow(2, 256, secp_fq_modulus) +secp_fq_R_inv = pow(secp_fq_R, -1, secp_fq_modulus) + +secp_fq_cube_root_montgomery = ( + 0x58a4361c8e81894e | + (0x03fde1631c4b80af << 64) | + (0xf8e98978d02e3905 << 128) | + (0x7a4a36aebcbb3d53 << 192) +) + +secp_fq_beta = (secp_fq_cube_root_montgomery * secp_fq_R_inv) % secp_fq_modulus +assert pow(secp_fq_beta, 3, secp_fq_modulus) == 1, "β³ ≡ 1 (mod p) for secp256k1 Fq" +assert secp_fq_beta != 1, "β must be non-trivial" + + +# ==================================================================================== +# § 15. SUMMARY +# ==================================================================================== # -# Performance: Instead of one 254-bit scalar multiplication k·P, we compute -# k1·P - k2·φ(P) with two ~127-bit scalars processed via interleaved WNAF, -# reducing the number of doublings by roughly half. 
+# Derived and verified GLV endomorphism constants for: +# - BN254 Fr (§1–§5): 254-bit, 256-bit shift, constants match bn254/fr.hpp +# - BN254 Fq (§6–§9): 254-bit, 256-bit shift, constants match bn254/fq.hpp +# - secp256k1 Fr (§10–§14): 256-bit, 384-bit shift, constants match secp256k1.hpp +# - secp256k1 Fq cube root β also verified (end of Part III) # -# References: -# • Gallant, Lambert, Vanstone: "Faster Point Multiplication on Elliptic Curves -# with Efficient Endomorphisms", CRYPTO 2001 -# • https://www.iacr.org/archive/crypto2001/21390189.pdf +# Architectural split: MODULUS_TOP_LIMB_LARGE_THRESHOLD (2^62) determines whether +# split_into_endomorphism_scalars (256-bit) or _384 (384-bit) is used. if __name__ == "__main__": - print("BN254 GLV Endomorphism - All verifications passed!") - print(f" λ (cube root of unity): {hex(lambda_val)}") - print(f" endo_g1: {hex(endo_g1)}") - print(f" endo_g2: {hex(endo_g2)}") + print("=== Part I: BN254 Fr ===") + print(f" λ (cube root): {hex(lambda_val)}") + print(f" endo_g1: {hex(endo_g1)}") + print(f" endo_g2: {hex(endo_g2)}") print(f" endo_minus_b1: {hex(endo_minus_b1)}") - print(f" endo_b2: {hex(endo_b2)}") - print("\nConstants match barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fr.hpp") + print(f" endo_b2: {hex(endo_b2)}") + print(" -> Constants match bn254/fr.hpp") + + print("\n=== Part II: BN254 Fq ===") + print(f" β (cube root): {hex(fq_beta)}") + print(f" endo_g1: {hex(fq_endo_g1)}") + print(f" endo_g2: {hex(fq_endo_g2)}") + print(f" endo_minus_b1: {hex(fq_endo_minus_b1)}") + print(f" endo_b2: {hex(fq_endo_b2)}") + print(" -> Constants match bn254/fq.hpp") + + print("\n=== Part III: secp256k1 Fr ===") + print(f" λ (cube root): {hex(secp_lambda)}") + print(f" endo_g1: {hex(secp_endo_g1)}") + print(f" endo_g2: {hex(secp_endo_g2)}") + print(f" endo_minus_b1: {hex(secp_endo_minus_b1)}") + print(f" endo_b2: {hex(secp_endo_b2)}") + print(" -> Constants match secp256k1.hpp FrParams") + + print("\n=== secp256k1 Fq (base 
field) ===") + print(f" β (cube root): {hex(secp_fq_beta)}") + print(" -> Cube root verified") + + print("\nAll verifications passed!") From f9be7cafedd9a9a0eaae68a8baeaa4429c18ce6b Mon Sep 17 00:00:00 2001 From: notnotraju Date: Tue, 24 Feb 2026 12:04:55 +0000 Subject: [PATCH 5/8] unified interfaces for BN fields and secp256k1, removed the "384-bit" branch. --- .../ecc/curves/secp256k1/secp256k1.hpp | 17 +- .../curves/secp256k1/secp256k1_endo_notes.hpp | 6 +- .../ecc/fields/endomorphism_scalars.py | 250 +++++++++--------- .../ecc/fields/field_declarations.hpp | 173 ++++++------ 4 files changed, 207 insertions(+), 239 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1.hpp b/barretenberg/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1.hpp index 3e68f5e2bbec..323c002898d6 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1.hpp @@ -217,15 +217,14 @@ struct FrParams { static constexpr uint64_t endo_b2_lo = 0xe86c90e49284eb15ULL; static constexpr uint64_t endo_b2_mid = 0x3086d221a7d46bcdULL; - static constexpr uint64_t endo_g1_lo = 0xE893209A45DBB031ULL; - static constexpr uint64_t endo_g1_mid = 0x3DAA8A1471E8CA7FULL; - static constexpr uint64_t endo_g1_hi = 0xE86C90E49284EB15ULL; - static constexpr uint64_t endo_g1_hihi = 0x3086D221A7D46BCDULL; - - static constexpr uint64_t endo_g2_lo = 0x1571B4AE8AC47F71ULL; - static constexpr uint64_t endo_g2_mid = 0x221208AC9DF506C6ULL; - static constexpr uint64_t endo_g2_hi = 0x6F547FA90ABFE4C4ULL; - static constexpr uint64_t endo_g2_hihi = 0xE4437ED6010E8828ULL; + // 256-bit-shift constants: g1 = floor((-b1) * 2^256 / r), g2 = floor(b2 * 2^256 / r) + // See endomorphism_scalars.py compute_splitting_constants() for derivation. 
+ static constexpr uint64_t endo_g1_lo = 0x6F547FA90ABFE4C4ULL; + static constexpr uint64_t endo_g1_mid = 0xE4437ED6010E8828ULL; + static constexpr uint64_t endo_g1_hi = 0x0ULL; + + static constexpr uint64_t endo_g2_lo = 0xE86C90E49284EB15ULL; + static constexpr uint64_t endo_g2_mid = 0x3086D221A7D46BCDULL; // Not used in secp256k1 static constexpr uint64_t primitive_root_0 = 0UL; diff --git a/barretenberg/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1_endo_notes.hpp b/barretenberg/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1_endo_notes.hpp index 55de295e6cc0..2e42c954b2e9 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1_endo_notes.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1_endo_notes.hpp @@ -14,11 +14,9 @@ struct basis_vectors { uint64_t endo_g1_lo = 0; uint64_t endo_g1_mid = 0; uint64_t endo_g1_hi = 0; - uint64_t endo_g1_hihi = 0; uint64_t endo_g2_lo = 0; uint64_t endo_g2_mid = 0; uint64_t endo_g2_hi = 0; - uint64_t endo_g2_hihi = 0; uint64_t endo_minus_b1_lo = 0; uint64_t endo_minus_b1_mid = 0; uint64_t endo_b2_lo = 0; @@ -108,7 +106,7 @@ struct basis_vectors { } uint512_t minus_b1 = -b1; - uint512_t shift256 = uint512_t(1) << 384; + uint512_t shift256 = uint512_t(1) << 256; uint512_t g1 = (-b1 * shift256) / uint512_t(secp256k1::fr::modulus); uint512_t g2 = (b2 * shift256) / uint512_t(secp256k1::fr::modulus); @@ -116,11 +114,9 @@ struct basis_vectors { result.endo_g1_lo = g1.lo.data[0]; result.endo_g1_mid = g1.lo.data[1]; result.endo_g1_hi = g1.lo.data[2]; - result.endo_g1_hihi = g1.lo.data[3]; result.endo_g2_lo = g2.lo.data[0]; result.endo_g2_mid = g2.lo.data[1]; result.endo_g2_hi = g2.lo.data[2]; - result.endo_g2_hihi = g2.lo.data[3]; result.endo_minus_b1_lo = minus_b1.lo.data[0]; result.endo_minus_b1_mid = minus_b1.lo.data[1]; result.endo_b2_lo = b2.lo.data[0]; diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py 
b/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py index 5f0152d2f5e3..c3a139bdf265 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py @@ -8,7 +8,7 @@ Part 0 (§0): Preliminaries — the GLV lattice and how to find a short basis Part I (§1–§5): BN254 Fr — the scalar field of BN254 (254-bit, uses 2^256 shift) Part II (§6–§9): BN254 Fq — the base field of BN254 (254-bit, uses 2^256 shift) - Part III (§10–§14): secp256k1 Fr — the scalar field of secp256k1 (256-bit, uses 2^384 shift) + Part III (§10–§14): secp256k1 Fr — the scalar field of secp256k1 (256-bit, now uses 2^256 shift) Reference: Gallant, Lambert, Vanstone, "Faster Point Multiplication on Elliptic Curves" (2001) @@ -92,24 +92,24 @@ def find_short_lattice_basis(lambda_val, modulus): # prev_coeff - quot·coeff. It follows that (-remainder, coeff) is always # a lattice vector: -remainder + λ·coeff ≡ -coeff·λ + coeff·λ ≡ 0. remainder, prev_remainder = lambda_val, modulus - coeff, prev_coeff = 1, 0 + coeff, prev_coeff = 1, 0 # Run until the remainder first drops below √p. while remainder >= approx_sqrt: - quot = prev_remainder // remainder + quot = prev_remainder // remainder prev_remainder, remainder = remainder, prev_remainder - quot * remainder - prev_coeff, coeff = coeff, prev_coeff - quot * coeff + prev_coeff, coeff = coeff, prev_coeff - quot * coeff # At this point: - # vec_before = (-prev_remainder, prev_coeff) — last step above √p - # vec_cross = (-remainder, coeff) — first step below √p + # vec_before = (-prev_remainder, prev_coeff) — last step above √p + # vec_cross = (-remainder, coeff) — first step below √p vec_before = (-prev_remainder, prev_coeff) - vec_cross = (-remainder, coeff) + vec_cross = (-remainder, coeff) # One more EEA step gives an independent candidate vector. 
- quot = prev_remainder // remainder - r_after = prev_remainder - quot * remainder - s_after = prev_coeff - quot * coeff + quot = prev_remainder // remainder + r_after = prev_remainder - quot * remainder + s_after = prev_coeff - quot * coeff vec_after = (-r_after, s_after) # First basis vector: vec_cross (shortest, by construction). @@ -126,8 +126,49 @@ def find_short_lattice_basis(lambda_val, modulus): return a1, b1, a2, b2 +# ==================================================================================== +# § 0a. THE 256-BIT-SHIFT APPROXIMATION — ERROR BOUND PROOF +# ==================================================================================== +# +# CLAIM: For any prime r < 2^256 and any b with |b| < r, define g = floor(b * 2^256 / r). +# Then for every k in [0, r): +# +# floor(g * k / 2^256) ∈ { floor(b * k / r), floor(b * k / r) - 1 } +# +# i.e., the approximation error is in {0, -1}. This holds for ALL curves with +# r < 2^256 — BN254, secp256k1, and any other. +# (Note that we used to use a 384 bit shift for secp256k1.) +# +# PROOF: +# +# Write the Euclidean division of b * 2^256 by r: +# +# b * 2^256 = g * r + ε where 0 ≤ ε < r ...(1) +# +# Rearranging: g = (b * 2^256 - ε) / r. Multiply both sides by k: +# +# g * k = b * k * 2^256 / r - ε * k / r +# +# Dividing by 2^256: +# +# g * k / 2^256 = b * k / r - ε * k / (r * 2^256) ...(2) +# +# The correction term δ := ε * k / (r * 2^256) satisfies: +# +# 0 ≤ δ = ε * k / (r * 2^256) < r * r / (r * 2^256) +# = r / 2^256 < 1 ...(3) +# +# (using ε < r, k < r, and r < 2^256). +# +# From (2) and (3): +# +# b*k/r - 1 < g*k/2^256 ≤ b*k/r ...(4) +# +# Taking floors of (4): if b*k/r = q + f where q = floor(b*k/r) and 0 ≤ f < 1, +# then g*k/2^256 ∈ (q + f - 1, q + f], so floor(g*k/2^256) ∈ {q-1, q}. 
∎ + # ╔══════════════════════════════════════════════════════════════════════════════╗ -# ║ PART I: BN254 Fr (Scalar Field) ║ +# ║ PART I: BN254 Fr (Scalar Field) ║ # ╚══════════════════════════════════════════════════════════════════════════════╝ # ==================================================================================== @@ -235,7 +276,8 @@ def compute_splitting_constants(modulus, b1, b2): # k2 = -δ1·|b1| + δ2·b2 where δ1, δ2 ∈ [0,1) are rounding errors. This is # negative when δ1·|b1| > δ2·b2. Since |b1|/b2 ≈ 2^63 for BN254, even tiny # δ1 can cause this. It happens at k ≈ ⌈m·2^256/endo_g2⌉ where c1 ticks up -# to m. Frequency: ~2^{-64} of all inputs. +# to m. Note that the ≈ means that it can happen for _many_ k around/slightly greater than that number. +# Frequency: ~2^{-64} of all inputs. # # FIX: When t1 > 128 bits (i.e. k2 < 0 wrapped mod r), add |b1|. This shifts # along the lattice vector (a1, b1), making k2 positive: @@ -333,7 +375,7 @@ def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): # ╔══════════════════════════════════════════════════════════════════════════════╗ -# ║ PART II: BN254 Fq (Base Field) ║ +# ║ PART II: BN254 Fq (Base Field) ║ # ╚══════════════════════════════════════════════════════════════════════════════╝ # ==================================================================================== @@ -432,7 +474,7 @@ def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): # ╔══════════════════════════════════════════════════════════════════════════════╗ -# ║ PART III: secp256k1 Fr (Scalar Field) ║ +# ║ PART III: secp256k1 Fr (Scalar Field) ║ # ╚══════════════════════════════════════════════════════════════════════════════╝ # ==================================================================================== @@ -440,11 +482,16 @@ def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): # ==================================================================================== # # secp256k1's scalar field modulus is a full 256 
bits (top limb = 0xFFFF...), -# exceeding MODULUS_TOP_LIMB_LARGE_THRESHOLD (2^62). AUDITTODO: explain *exactly* the rounding issues that force this. -# This requires: -# - 2^384 shift instead of 2^256 (a >>256 shift loses precision for 256-bit moduli) -# - 4-limb endo_g constants (lo/mid/hi/hihi) -# - Montgomery field multiplication in split_into_endomorphism_scalars_384 +# exceeding MODULUS_TOP_LIMB_LARGE_THRESHOLD (2^62). As proved in §0a, the +# 256-bit shift approximation is sufficient for ANY prime r < 2^256 — the +# error is bounded to {0, -1}. The old 384-bit path was only needed because +# the 256-bit C++ code truncated outputs to 128 bits, clipping the 129th bit +# that appears for ~26% of secp256k1 inputs. With full-width output, 256-bit +# shift works perfectly. +# +# For secp256k1, the large-modulus branch of split_into_endomorphism_scalars +# returns full field elements (not truncated to 128 bits). The caller +# (biggroup_nafs.hpp) handles signs by inspecting the MSB of k2. # The scalar field modulus of secp256k1 (from secp256k1.hpp, FrParams) secp_r = ( @@ -491,55 +538,26 @@ def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): # ==================================================================================== -# § 12. secp256k1 Fr — PRECOMPUTED CONSTANTS (384-bit shift) +# § 12. secp256k1 Fr — PRECOMPUTED CONSTANTS (256-bit shift) # ==================================================================================== # -# In the 384-bit code, the naming is "cross-paired" — g1 is paired with minus_b1, -# and g2 is paired with b2 (the opposite of what you might expect): +# secp256k1 now uses the SAME 256-bit shift as BN254 (see §0a for the proof that +# this is sufficient for any r < 2^256). 
The naming convention matches BN254: # -# endo_g1 = ⌈b2 · 2^384 / r⌉ -# endo_g2 = ⌊(-b1) · 2^384 / r⌋ +# endo_g1 = ⌊(-b1) · 2^256 / r⌋ +# endo_g2 = ⌊b2 · 2^256 / r⌋ # -# Note: secp256k1_endo_notes.hpp uses the opposite naming convention for g1/g2, -# but the STORED values in FrParams follow this cross-paired convention. - -def compute_splitting_constants_384(modulus, b1, b2): - """ - Compute the precomputed constants for the 384-bit shift variant. +# We reuse compute_splitting_constants() from §3 — same function, same shift. - Returns (endo_g1, endo_g2, endo_minus_b1, endo_b2) matching the hpp file. - - Convention: endo_g1 is the b2-based approximation (cross-paired with minus_b1), - endo_g2 is the (-b1)-based approximation (cross-paired with b2). - """ - shift = 1 << 384 - # endo_g1 = ceil(b2 * 2^384 / r) — cross-paired with minus_b1 in the algorithm - endo_g1 = -((-b2 * shift) // modulus) - # endo_g2 = floor((-b1) * 2^384 / r) — cross-paired with b2 in the algorithm - endo_g2 = ((-b1) * shift) // modulus - endo_minus_b1 = (-b1) % modulus - endo_b2 = b2 % modulus - return endo_g1, endo_g2, endo_minus_b1, endo_b2 - - -secp_endo_g1, secp_endo_g2, secp_endo_minus_b1, secp_endo_b2 = compute_splitting_constants_384( +secp_endo_g1, secp_endo_g2, secp_endo_minus_b1, secp_endo_b2 = compute_splitting_constants( secp_r, secp_b1, secp_b2 ) # Verify these match the values in secp256k1.hpp (FrParams) -# endo_g1 is stored as (lo, mid, hi, hihi) — 4 × 64-bit limbs -secp_expected_endo_g1 = ( - 0xE893209A45DBB031 | - (0x3DAA8A1471E8CA7F << 64) | - (0xE86C90E49284EB15 << 128) | - (0x3086D221A7D46BCD << 192) -) -secp_expected_endo_g2 = ( - 0x1571B4AE8AC47F71 | - (0x221208AC9DF506C6 << 64) | - (0x6F547FA90ABFE4C4 << 128) | - (0xE4437ED6010E8828 << 192) -) +# endo_g1 is stored as (lo, mid, hi) — only 2 non-zero limbs for secp256k1 +# (hi = 0 because (-b1) * 2^256 / r fits in 128 bits for this curve) +secp_expected_endo_g1 = 0x6F547FA90ABFE4C4 | (0xE4437ED6010E8828 << 64) 
+secp_expected_endo_g2 = 0xE86C90E49284EB15 | (0x3086D221A7D46BCD << 64) secp_expected_endo_minus_b1 = 0x6F547FA90ABFE4C3 | (0xE4437ED6010E8828 << 64) secp_expected_endo_b2 = 0xe86c90e49284eb15 | (0x3086d221a7d46bcd << 64) @@ -558,48 +576,52 @@ def compute_splitting_constants_384(modulus, b1, b2): # ==================================================================================== -# § 13. secp256k1 Fr — THE 384-BIT SPLITTING ALGORITHM +# § 13. secp256k1 Fr — THE 256-BIT SPLITTING ALGORITHM # ==================================================================================== # -# Unlike the 256-bit variant, there is no explicit negative-k2 fix — field -# subtraction handles signs. The c1, c2 values are converted to Montgomery form -# and multiplied via field ops (which reduce mod r automatically). +# secp256k1 uses the same core computation as BN254 (compute_endomorphism_k2 in +# field_declarations.hpp), but WITHOUT the negative-k2 fix. The result k2 is a +# full field element (up to ~129 bits); the caller handles signs. # -# ALGORITHM (split_into_endomorphism_scalars_384 in field_declarations.hpp): +# ALGORITHM (the `else` branch of split_into_endomorphism_scalars in +# field_declarations.hpp, for large moduli): # -# 1. c1 = (endo_g1 · k) >> 384, c2 = (endo_g2 · k) >> 384 -# 2. r2f = c1·(-b1) - c2·b2 (cross-products, computed as field elements) -# 3. r1f = k - r2f·λ -# 4. k1 = r1f, k2 = -r2f +# 1. t1 = compute_endomorphism_k2(k) +# i.e. c1 = (endo_g2 · k) >> 256, c2 = (endo_g1 · k) >> 256 +# t1 = (c2·b2 - c1·(-b1)) mod r [= k2] +# 2. k2 = t1 +# 3. k1 = (t1·λ + k) mod r +# +# No negative-k2 fix: unlike BN254, we do NOT check if t1 > 128 bits. The +# biggroup code inspects the MSB of k2 to determine its sign. -def split_scalar_384(k, modulus, lambda_val, endo_g1, endo_g2, endo_minus_b1, endo_b2): +def split_scalar_secp256k1(k, modulus, lambda_val, endo_g1, endo_g2, endo_minus_b1, endo_b2): """ - Split scalar k using the 384-bit shift variant. 
+ Split scalar k using the 256-bit shift for secp256k1. - Implements split_into_endomorphism_scalars_384() in field_declarations.hpp. + Implements the large-modulus branch of split_into_endomorphism_scalars() + in field_declarations.hpp. Returns: - (k1, k2): The split scalars such that k ≡ k1 - λ·k2 (mod r) + (k1, k2): Full field elements such that k ≡ k1 - λ·k2 (mod r). + k2 fits in ~129 bits; k1 fits in ~128 bits. """ input_val = k % modulus - # c1 ≈ k·b2/r, c2 ≈ k·(-b1)/r - c1 = (endo_g1 * input_val) >> 384 - c2 = (endo_g2 * input_val) >> 384 - - # Cross-products (computed as field elements in C++ via Montgomery) - c1_times_minus_b1 = (c1 * endo_minus_b1) % modulus - c2_times_b2 = (c2 * endo_b2) % modulus + # c1 = (g2 * k) >> 256, c2 = (g1 * k) >> 256 + c1 = (endo_g2 * input_val) >> 256 + c2 = (endo_g1 * input_val) >> 256 - # r2f = c1·(-b1) - c2·b2 (nearly cancels, leaving small lattice error) - r2f = (c1_times_minus_b1 - c2_times_b2) % modulus + # q1 = c1 * (-b1), q2 = c2 * b2 (raw integer multiply, low 256 bits) + q1_lo = (c1 * endo_minus_b1) & ((1 << 256) - 1) + q2_lo = (c2 * endo_b2) & ((1 << 256) - 1) - # r1f = k - r2f·λ - r1f = (input_val - r2f * lambda_val) % modulus + # t1 = (q2 - q1) mod r — this is k2 + t1 = (q2_lo - q1_lo) % modulus - # k1 = r1f, k2 = -r2f; invariant: k ≡ k1 - λ·k2 (mod r) - k1 = r1f - k2 = (-r2f) % modulus + # k1 = t1·λ + k (mod r) + k1 = (t1 * lambda_val + input_val) % modulus + k2 = t1 return k1, k2 @@ -608,8 +630,8 @@ def split_scalar_384(k, modulus, lambda_val, endo_g1, endo_g2, endo_minus_b1, en # § 14. 
secp256k1 Fr — SPLITTING VERIFICATION # ==================================================================================== -def verify_split_384(k, k1, k2, lambda_val, modulus): - """Verify correctness of the 384-bit scalar split.""" +def verify_split_secp256k1(k, k1, k2, lambda_val, modulus): + """Verify correctness of the secp256k1 scalar split.""" # The invariant is k ≡ k1 - λ·k2 (mod r) reconstructed = (k1 - lambda_val * k2) % modulus assert reconstructed == k % modulus, ( @@ -617,10 +639,8 @@ def verify_split_384(k, k1, k2, lambda_val, modulus): f" k1={hex(k1)}, k2={hex(k2)}\n" f" reconstructed={hex(reconstructed)}, expected={hex(k % modulus)}" ) - # For the 384-bit variant, k1 and k2 are field elements; they should be small - # enough that the decomposition is useful. We verify they fit in ~129 bits. - # (The C++ code does not explicitly truncate to 128 bits in this path; - # the values may be slightly larger than in the 256-bit path.) + # k1 and k2 are full field elements. We verify their effective magnitudes + # fit in ~129 bits (the decomposition halves the scalar bit-length). k1_eff = k1 if k1 <= modulus // 2 else modulus - k1 k2_eff = k2 if k2 <= modulus // 2 else modulus - k2 assert k1_eff.bit_length() <= 129, ( @@ -632,38 +652,10 @@ def verify_split_384(k, k1, k2, lambda_val, modulus): for k_test in [0, 1, 42, secp_lambda, secp_r - 1, secp_r // 2, secp_r // 3]: - k1, k2 = split_scalar_384( + k1, k2 = split_scalar_secp256k1( k_test, secp_r, secp_lambda, secp_endo_g1, secp_endo_g2, secp_endo_minus_b1, secp_endo_b2 ) - verify_split_384(k_test, k1, k2, secp_lambda, secp_r) - - -# Also verify with the cube root of unity in the BASE field (secp256k1 Fq). -# The base field Fq of secp256k1 has modulus p = 2^256 - 2^32 - 977, which also -# has a cube root of unity β. This β is what gets multiplied with the x-coordinate -# in the endomorphism φ(x,y) = (β·x, y). Let's verify it. 
- -secp_fq_modulus = ( - 0xFFFFFFFEFFFFFC2F | - (0xFFFFFFFFFFFFFFFF << 64) | - (0xFFFFFFFFFFFFFFFF << 128) | - (0xFFFFFFFFFFFFFFFF << 192) -) - -secp_fq_R = pow(2, 256, secp_fq_modulus) -secp_fq_R_inv = pow(secp_fq_R, -1, secp_fq_modulus) - -secp_fq_cube_root_montgomery = ( - 0x58a4361c8e81894e | - (0x03fde1631c4b80af << 64) | - (0xf8e98978d02e3905 << 128) | - (0x7a4a36aebcbb3d53 << 192) -) - -secp_fq_beta = (secp_fq_cube_root_montgomery * secp_fq_R_inv) % secp_fq_modulus -assert pow(secp_fq_beta, 3, secp_fq_modulus) == 1, "β³ ≡ 1 (mod p) for secp256k1 Fq" -assert secp_fq_beta != 1, "β must be non-trivial" - + verify_split_secp256k1(k_test, k1, k2, secp_lambda, secp_r) # ==================================================================================== # § 15. SUMMARY @@ -672,11 +664,15 @@ def verify_split_384(k, k1, k2, lambda_val, modulus): # Derived and verified GLV endomorphism constants for: # - BN254 Fr (§1–§5): 254-bit, 256-bit shift, constants match bn254/fr.hpp # - BN254 Fq (§6–§9): 254-bit, 256-bit shift, constants match bn254/fq.hpp -# - secp256k1 Fr (§10–§14): 256-bit, 384-bit shift, constants match secp256k1.hpp -# - secp256k1 Fq cube root β also verified (end of Part III) -# -# Architectural split: MODULUS_TOP_LIMB_LARGE_THRESHOLD (2^62) determines whether -# split_into_endomorphism_scalars (256-bit) or _384 (384-bit) is used. +# - secp256k1 Fr (§10–§14): 256-bit, 256-bit shift, constants match secp256k1.hpp +# +# ALL curves use the same 256-bit shift (see §0a for the proof that this is +# sufficient for any r < 2^256). MODULUS_TOP_LIMB_LARGE_THRESHOLD (2^62) +# determines whether the 128-bit pair path (with negative-k2 fix) or the +# full-width path (no fix, caller handles signs) is used. The former is ONLY +# called for BN254 fields, as their lattice basis is unusually short and hence +# the splitting scalars comfortably fit in 128 bits; in general, for 256-bit fields, +# k1 and k2 will have ~ 128 or 129 bits. 
if __name__ == "__main__": print("=== Part I: BN254 Fr ===") diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp b/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp index ed47d11f06b7..b83fdb489273 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp @@ -45,6 +45,11 @@ namespace bb { // // In Barretenberg, the main workhorse fields are the base and scalar fields of BN-254, which are "small" moduli: they // are each 254 bits. The field algorithms for them are constant-time. +// +// NOTE: For the 254-bit fields in Barretenberg, namely BN254 base and scalar fields, we also +// use this constexpr branching to capture another (conceptually unrelated) property: that +// the short basis of the lattice from the endomorphism is shorter than expected. See endomorphism_scalars.py for more +// information. static constexpr uint64_t MODULUS_TOP_LIMB_LARGE_THRESHOLD = 0x4000000000000000ULL; // 2^62 /** @@ -466,71 +471,90 @@ template struct alignas(32) field { * We pre-compute scalars g1 = (2^256 * b1) / n, g2 = (2^256 * b2) / n, to avoid having to perform long division * on 512-bit scalars **/ - static void split_into_endomorphism_scalars(const field& k, field& k1, field& k2) - { - // if the modulus is a >= 255-bit integer, we need to use a basis where g1, g2 have been shifted by 2^384 - if constexpr (Params::modulus_3 >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) { - split_into_endomorphism_scalars_384(k, k1, k2); - } else { - std::pair, std::array> ret = split_into_endomorphism_scalars(k); - k1.data[0] = ret.first[0]; - k1.data[1] = ret.first[1]; - -#if !defined(__clang__) && defined(__GNUC__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" -#endif - k2.data[0] = ret.second[0]; // NOLINT - k2.data[1] = ret.second[1]; -#if !defined(__clang__) && defined(__GNUC__) -#pragma GCC diagnostic pop -#endif - } - } - - 
// NOTE: this form is only usable if the modulus is 254 bits or less, otherwise see - // split_into_endomorphism_scalars_384. - // DOES NOT assume that the input is reduced; it can be in coarse form. - // TODO(https://github.com/AztecProtocol/barretenberg/issues/851): Unify these APIs. - static std::pair, std::array> split_into_endomorphism_scalars(const field& k) + /** + * @brief Shared core of the endomorphism scalar decomposition. + * + * Computes k2 = round(b2·k/r)·(-b1) + round((-b1)·k/r)·b2, using the + * 256-bit-shift approximation g = floor(b·2^256/r) for both BN254 and + * secp256k1. See endomorphism_scalars.py §0 for the proof that the + * approximation error is bounded to {0, -1} for any r < 2^256. + * + * The result is a raw (non-Montgomery) `field` whose low 128-or-129 bits + * hold k2. This function will be called in either the BN254 base/scalar field + * or the generic, secp256k1 branch. + */ + static field compute_endomorphism_k2(const field& input) { - static_assert(Params::modulus_3 < MODULUS_TOP_LIMB_LARGE_THRESHOLD); - field input = k.reduce_once(); - constexpr field endo_g1 = { Params::endo_g1_lo, Params::endo_g1_mid, Params::endo_g1_hi, 0 }; - constexpr field endo_g2 = { Params::endo_g2_lo, Params::endo_g2_mid, 0, 0 }; - constexpr field endo_minus_b1 = { Params::endo_minus_b1_lo, Params::endo_minus_b1_mid, 0, 0 }; - constexpr field endo_b2 = { Params::endo_b2_lo, Params::endo_b2_mid, 0, 0 }; - // compute c1 = (g2 * k) >> 256 + // c1 = (g2 * k) >> 256, c2 = (g1 * k) >> 256 wide_array c1 = endo_g2.mul_512(input); - // compute c2 = (g1 * k) >> 256 wide_array c2 = endo_g1.mul_512(input); - // (the bit shifts are implicit, as we only utilize the high limbs of c1, c2 + // extract high halves + field c1_hi{ c1.data[4], c1.data[5], c1.data[6], c1.data[7] }; + field c2_hi{ c2.data[4], c2.data[5], c2.data[6], c2.data[7] }; - field c1_hi = { - c1.data[4], c1.data[5], c1.data[6], c1.data[7] - }; // *(field*)((uintptr_t)(&c1) + (4 * sizeof(uint64_t))); - 
field c2_hi = { - c2.data[4], c2.data[5], c2.data[6], c2.data[7] - }; // *(field*)((uintptr_t)(&c2) + (4 * sizeof(uint64_t))); - - // compute q1 = c1 * -b1 + // q1 = c1 * (-b1), q2 = c2 * b2 wide_array q1 = c1_hi.mul_512(endo_minus_b1); - // compute q2 = c2 * b2 wide_array q2 = c2_hi.mul_512(endo_b2); - // FIX: Avoid using 512-bit multiplication as its not necessary. - // c1_hi, c2_hi can be uint256_t's and the final result (without montgomery reduction) - // could be casted to a field. field q1_lo{ q1.data[0], q1.data[1], q1.data[2], q1.data[3] }; field q2_lo{ q2.data[0], q2.data[1], q2.data[2], q2.data[3] }; - field t1 = (q2_lo - q1_lo).reduce_once(); + return (q2_lo - q1_lo).reduce_once(); + } + + /** + * @brief Full-width endomorphism decomposition: k ≡ k1 - k2·λ (mod r). + * Modifies the field elements k1 and k2. + * + * For BN254 base/scalar fields, delegates to the 128-bit pair + * overload, which applies the negative-k2 fix. Returns k1, k2 in the low + * 2 limbs (upper limbs zeroed). Both fit in 128 bits. + * + * For generic 256-bit fields: returns k1, k2 as full field elements + * elements (non-Montgomery). k1 fits in ~128 bits; k2 fits in ~129 bits. + * No negative-k2 fix — the caller (biggroup_nafs.hpp) handles signs by + * inspecting the MSB of k2. + */ + static void split_into_endomorphism_scalars(const field& k, field& k1, field& k2) + { + field input = k.reduce_once(); + if constexpr (Params::modulus_3 < MODULUS_TOP_LIMB_LARGE_THRESHOLD) { + // BN254 base or scalar field: use 128-bit path with negative-k2 fix. + auto ret = split_into_endomorphism_scalars(input); + k1 = { ret.first[0], ret.first[1], 0, 0 }; + k2 = { ret.second[0], ret.second[1], 0, 0 }; + } else { + // Large modulus (secp256k1): full-width path. + field t1 = compute_endomorphism_k2(input); + k2 = t1; + k1 = ((t1 * cube_root_of_unity()) + input).reduce_once(); + } + } + + /** + * @brief 128-bit endomorphism decomposition: k ≡ k1 - k2·λ (mod r). 
+ * + * Returns { {k1_lo, k1_hi}, {k2_lo, k2_hi} } — each scalar as a pair of + * uint64_t representing its low 128 bits. Both k1 and k2 are guaranteed to + * fit in 128 bits (the negative-k2 fix ensures this for the ~2^{-64} of + * inputs where k2 would otherwise be slightly negative). + * + * Only valid for fields such that the splitting_scalars algorithm produces 128 bit outputs. In Barretenberg, these + * are just the base and scalar fields of BN254. These are the only "small modulus" fields, so we use a static + * assert to force this. + * + * Assumes that the input is in strict form, i.e., in the range [0, p). + */ + static std::pair, std::array> split_into_endomorphism_scalars(const field& k) + { + static_assert(Params::modulus_3 < MODULUS_TOP_LIMB_LARGE_THRESHOLD); + field t1 = compute_endomorphism_k2(k); // k2 (= t1) can be slightly negative for ~2^{-64} of inputs. // When negative, t1 = k2 + r is 254 bits (upper limbs nonzero). @@ -538,64 +562,17 @@ template struct alignas(32) field { // This shifts k2 by +|b1| (~127 bits, now positive) and k1 by -a1 (~64 bits), // keeping both within 128 bits. See endomorphism_scalars.py for more details. 
if (t1.data[2] != 0 || t1.data[3] != 0) { + constexpr field endo_minus_b1 = { Params::endo_minus_b1_lo, Params::endo_minus_b1_mid, 0, 0 }; t1 = (t1 + endo_minus_b1).reduce_once(); } - field beta = cube_root_of_unity(); - field t2 = (t1 * beta + input).reduce_once(); + field t2 = ((t1 * cube_root_of_unity()) + k).reduce_once(); return { { t2.data[0], t2.data[1] }, { t1.data[0], t1.data[1] }, }; } - static void split_into_endomorphism_scalars_384(const field& input, field& k1_out, field& k2_out) - { - constexpr field minus_b1f{ - Params::endo_minus_b1_lo, - Params::endo_minus_b1_mid, - 0, - 0, - }; - constexpr field b2f{ - Params::endo_b2_lo, - Params::endo_b2_mid, - 0, - 0, - }; - constexpr uint256_t g1{ - Params::endo_g1_lo, - Params::endo_g1_mid, - Params::endo_g1_hi, - Params::endo_g1_hihi, - }; - constexpr uint256_t g2{ - Params::endo_g2_lo, - Params::endo_g2_mid, - Params::endo_g2_hi, - Params::endo_g2_hihi, - }; - - field kf = input.reduce_once(); - uint256_t k{ kf.data[0], kf.data[1], kf.data[2], kf.data[3] }; - - uint512_t c1 = (uint512_t(k) * static_cast(g1)) >> 384; - uint512_t c2 = (uint512_t(k) * static_cast(g2)) >> 384; - - field c1f{ c1.lo.data[0], c1.lo.data[1], c1.lo.data[2], c1.lo.data[3] }; - field c2f{ c2.lo.data[0], c2.lo.data[1], c2.lo.data[2], c2.lo.data[3] }; - - c1f.self_to_montgomery_form(); - c2f.self_to_montgomery_form(); - c1f = c1f * minus_b1f; - c2f = c2f * b2f; - field r2f = c1f - c2f; - field beta = cube_root_of_unity(); - field r1f = input.reduce_once() - r2f * beta; - k1_out = r1f; - k2_out = -r2f; - } - // static constexpr auto coset_generators = compute_coset_generators(); // static constexpr std::array coset_generators = compute_coset_generators((1 << 30U)); From 6aefc4127fa895c3967ccd74388a7b9ba06a9df3 Mon Sep 17 00:00:00 2001 From: notnotraju Date: Tue, 24 Feb 2026 16:47:03 +0000 Subject: [PATCH 6/8] fix the WASM issue by restructuring the `reduce_once()` calls. 
--- .../ecc/fields/field_declarations.hpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp b/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp index b83fdb489273..3de47d38bb6d 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp @@ -483,8 +483,11 @@ template struct alignas(32) field { * hold k2. This function will be called in either the BN254 base/scalar field * or the generic, secp256k1 branch. */ - static field compute_endomorphism_k2(const field& input) + static field compute_endomorphism_k2(const field& k) { + // force into strict form. + field input = k.reduce_once(); + constexpr field endo_g1 = { Params::endo_g1_lo, Params::endo_g1_mid, Params::endo_g1_hi, 0 }; constexpr field endo_g2 = { Params::endo_g2_lo, Params::endo_g2_mid, 0, 0 }; constexpr field endo_minus_b1 = { Params::endo_minus_b1_lo, Params::endo_minus_b1_mid, 0, 0 }; @@ -523,17 +526,16 @@ template struct alignas(32) field { */ static void split_into_endomorphism_scalars(const field& k, field& k1, field& k2) { - field input = k.reduce_once(); if constexpr (Params::modulus_3 < MODULUS_TOP_LIMB_LARGE_THRESHOLD) { - // BN254 base or scalar field: use 128-bit path with negative-k2 fix. - auto ret = split_into_endomorphism_scalars(input); + // BN254 base or scalar field: use path that corresponds to 128-bit outputs. + auto ret = split_into_endomorphism_scalars(k); k1 = { ret.first[0], ret.first[1], 0, 0 }; k2 = { ret.second[0], ret.second[1], 0, 0 }; } else { // Large modulus (secp256k1): full-width path. 
- field t1 = compute_endomorphism_k2(input); + field t1 = compute_endomorphism_k2(k); k2 = t1; - k1 = ((t1 * cube_root_of_unity()) + input).reduce_once(); + k1 = ((t1 * cube_root_of_unity()) + k).reduce_once(); } } @@ -549,7 +551,7 @@ template struct alignas(32) field { * are just the base and scalar fields of BN254. These are the only "small modulus" fields, so we use a static * assert to force this. * - * Assumes that the input is in strict form, i.e., in the range [0, p). + * Does NOT assume that the input is reduced */ static std::pair, std::array> split_into_endomorphism_scalars(const field& k) { From 0ae3cc1a5c811ba4192d6f67212c9c9a7fa01f31 Mon Sep 17 00:00:00 2001 From: notnotraju Date: Fri, 27 Feb 2026 11:04:50 +0000 Subject: [PATCH 7/8] updated the documentation to clarify when/how secp256k1 split scalars can be bigger than 2^128. --- .../ecc/fields/endomorphism_scalars.py | 79 ++++++++++++++----- 1 file changed, 60 insertions(+), 19 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py b/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py index c3a139bdf265..5a07bf3ffe03 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py @@ -55,14 +55,20 @@ # down through √p and below. We stop at the first remainder r_j < √p and read off # two short lattice vectors from the Bézout coefficients at steps j−1 and j. # -# The resulting vector sizes depend on the specific λ and p: +# The resulting vector sizes are generically ~√p, but the exact sizes determine +# whether the split scalars k1, k2 fit in 128 bits: # -# • BN254 (Fr and Fq): the curve is constructed from a 63-bit parameter x, and the -# lattice vectors are a1 = b2 = 2x+1 (64 bits), |b1| = 6x²+2x (127 bits). -# This asymmetric 64/127-bit pattern is a consequence of the BN parametrisation. +# • BN254 (Fr and Fq): p is 254 bits, so √p ~ 127 bits. 
The lattice vectors +# have |b1| ≤ 127 bits, |b2| ≤ 64 bits (the asymmetry comes from the explicit BN). +# Since k2 = f1·|b1| - f2·b2 with f1,f2 ∈ [0,1), we get |k2| < 2^127, +# which fits comfortably in 128 bits with 1 bit of headroom. Similarly, +# |a1| = 64 bits and |a2| = 127 bits, so similar logic applies for k1. # -# • secp256k1 Fr: no small generating parameter; the lattice vectors are all in the -# generic ~126–129-bit range (roughly √p for a 256-bit prime). +# • secp256k1 Fr: p is 256 bits, so √p ~ 128 bits. The lattice basis has +# |b1| = 128 bits and |b2| = 126 bits. However, |a1| = 126 bits and |a2| +# = 129 bits. This implies that the naive bound only gives that |k1| +# ≤ 129 bits. Indeed, k1 is 129 bits ~25% of the time. It turns out that |k2| +# is 129 bits roughly 0.3% of the time. # from math import isqrt @@ -486,8 +492,8 @@ def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): # 256-bit shift approximation is sufficient for ANY prime r < 2^256 — the # error is bounded to {0, -1}. The old 384-bit path was only needed because # the 256-bit C++ code truncated outputs to 128 bits, clipping the 129th bit -# that appears for ~26% of secp256k1 inputs. With full-width output, 256-bit -# shift works perfectly. +# that appears for ~25% of inputs (k1) and ~0.3% of inputs (k2). With +# full-width output, 256-bit shift works perfectly. # # For secp256k1, the large-modulus branch of split_into_endomorphism_scalars # returns full field elements (not truncated to 128 bits). The caller @@ -524,7 +530,8 @@ def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): # § 11. secp256k1 Fr — LATTICE BASIS # ==================================================================================== # -# See §0 for why these vectors are ~126–129 bits (unlike BN254's 64/127 pattern). +# See §0 for the component sizes: |a1| = 126, |b1| = 128, |a2| = 129, |b2| = 126 bits +# (unlike BN254's asymmetric 64/127 pattern). 
secp_a1, secp_b1, secp_a2, secp_b2 = find_short_lattice_basis(secp_lambda, secp_r) @@ -580,8 +587,10 @@ def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): # ==================================================================================== # # secp256k1 uses the same core computation as BN254 (compute_endomorphism_k2 in -# field_declarations.hpp), but WITHOUT the negative-k2 fix. The result k2 is a -# full field element (up to ~129 bits); the caller handles signs. +# field_declarations.hpp), but WITHOUT the negative-k2 fix. Both k1 and k2 can +# reach 129 bits (see §0 for why); the caller handles signs. This specifically means +# that if k2 is (naively) negative, it will be returned as r - k2, a 256-bit number, +# and the caller will then detect this and handle appropriately. # # ALGORITHM (the `else` branch of split_into_endomorphism_scalars in # field_declarations.hpp, for large moduli): @@ -604,7 +613,7 @@ def split_scalar_secp256k1(k, modulus, lambda_val, endo_g1, endo_g2, endo_minus_ Returns: (k1, k2): Full field elements such that k ≡ k1 - λ·k2 (mod r). - k2 fits in ~129 bits; k1 fits in ~128 bits. + Both k1 and k2 can reach ~129 bits (see §0). """ input_val = k % modulus @@ -669,10 +678,43 @@ def verify_split_secp256k1(k, k1, k2, lambda_val, modulus): # ALL curves use the same 256-bit shift (see §0a for the proof that this is # sufficient for any r < 2^256). MODULUS_TOP_LIMB_LARGE_THRESHOLD (2^62) # determines whether the 128-bit pair path (with negative-k2 fix) or the -# full-width path (no fix, caller handles signs) is used. The former is ONLY -# called for BN254 fields, as their lattice basis is unusually short and hence -# the splitting scalars comfortably fit in 128 bits; in general, for 256-bit fields, -# k1 and k2 will have ~ 128 or 129 bits. +# full-width path (no fix, caller handles signs) is used. The 128-bit path +# works for BN254 because its 254-bit modulus gives √r ~ 127 bits, leaving +# one bit of headroom below 128. 
For 256-bit moduli like secp256k1, √r ~ 128 +# bits leaves zero headroom: k1 exceeds 128 bits ~25% of the time (since +# |a2| = 129 bits) and k2 exceeds 128 bits ~0.3% of the time (see §16). + +# ==================================================================================== +# § 16. APPENDIX: 129-BIT SCALARS FOR secp256k1 +# ==================================================================================== +# +# Empirically verify the 129-bit overflow frequencies from §0 by calling +# split_scalar_secp256k1 (§13) on random inputs. + +def measure_overflow_frequency(n_samples=500_000): + """Measure the fraction of random scalars whose k1 or k2 exceeds 128 bits.""" + import random + rng = random.Random(42) + + count_k1 = 0 + count_k2 = 0 + for _ in range(n_samples): + k = rng.randrange(1, secp_r) + k1, k2 = split_scalar_secp256k1( + k, secp_r, secp_lambda, + secp_endo_g1, secp_endo_g2, secp_endo_minus_b1, secp_endo_b2 + ) + if k1.bit_length() > 128: + count_k1 += 1 + if k2 <= (secp_r - (1 << 127)) and k2.bit_length() > 128: # k2 is naively in the range (-2^127, 2^129). therefore we throw away when k2 is negative. 
+ count_k2 += 1 + + pct_k1 = 100 * count_k1 / n_samples + pct_k2 = 100 * count_k2 / n_samples + print(f" k1 > 128 bits: {count_k1}/{n_samples} ({pct_k1:.1f}%)") + print(f" k2 positive and > 128 bits: {count_k2}/{n_samples} ({pct_k2:.2f}%)") + assert 20 < pct_k1 < 35, f"Expected ~25% for k1, got {pct_k1}%" + assert 0.1 < pct_k2 < 1.0, f"Expected ~0.3% for k2, got {pct_k2}%" if __name__ == "__main__": print("=== Part I: BN254 Fr ===") @@ -699,8 +741,7 @@ def verify_split_secp256k1(k, k1, k2, lambda_val, modulus): print(f" endo_b2: {hex(secp_endo_b2)}") print(" -> Constants match secp256k1.hpp FrParams") - print("\n=== secp256k1 Fq (base field) ===") - print(f" β (cube root): {hex(secp_fq_beta)}") - print(" -> Cube root verified") + print("\n=== Appendix: 129-bit overflow frequency (secp256k1) ===") + measure_overflow_frequency() print("\nAll verifications passed!") From 1784aa6d50607bdf69036462b191b60f9fe4d4fc Mon Sep 17 00:00:00 2001 From: notnotraju Date: Wed, 4 Mar 2026 13:38:57 +0000 Subject: [PATCH 8/8] updated endomorphism scalars documentation to explain why the short bases for fq and fr are so close to each other --- .../ecc/fields/endomorphism_scalars.py | 71 +++++++++++++++++-- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py b/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py index 5a07bf3ffe03..69961ad7f210 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/endomorphism_scalars.py @@ -5,10 +5,12 @@ This document explains the "splitting scalars" algorithm in Barretenberg for all curves that admit an efficient endomorphism. 
We cover: - Part 0 (§0): Preliminaries — the GLV lattice and how to find a short basis - Part I (§1–§5): BN254 Fr — the scalar field of BN254 (254-bit, uses 2^256 shift) - Part II (§6–§9): BN254 Fq — the base field of BN254 (254-bit, uses 2^256 shift) - Part III (§10–§14): secp256k1 Fr — the scalar field of secp256k1 (256-bit, now uses 2^256 shift) + Part 0 (§0): Preliminaries — the GLV lattice and how to find a short basis + Part I (§1–§5): BN254 Fr — the scalar field of BN254 (254-bit, uses 2^256 shift) + Part II (§6–§9): BN254 Fq — the base field of BN254 (254-bit, uses 2^256 shift) + Part III (§10–§14): secp256k1 Fr — the scalar field of secp256k1 (256-bit, now uses 2^256 shift) + Appendix (§16): 129-bit scalars for secp256k1 + Appendix (§17): Why the BN254 Fr and Fq lattice bases are nearly identical Reference: Gallant, Lambert, Vanstone, "Faster Point Multiplication on Elliptic Curves" (2001) @@ -181,8 +183,12 @@ def find_short_lattice_basis(lambda_val, modulus): # § 1. BN254 Fr — FIELD PARAMETERS # ==================================================================================== +# The BN parameter x (see §17 for why the Fr and Fq lattice bases are nearly identical) +x_bn = 4965661367192848881 # 0x44e992b44a6909f1, 63 bits + # The scalar field modulus of BN254 (from bn254/fr.hpp) r = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001 +assert r == 36*x_bn**4 + 36*x_bn**3 + 18*x_bn**2 + 6*x_bn + 1, "r = 36x⁴ + 36x³ + 18x² + 6x + 1" # Montgomery parameter: R = 2^256 mod r # This is needed because fr.hpp stores values in Montgomery form @@ -217,7 +223,13 @@ def find_short_lattice_basis(lambda_val, modulus): a2 = 0x6f4d8248eeb859fd0be4e1541221250b # 127 bits b2 = 0x89d3256894d213e3 # 64 bits -# NOTE: a remarkable feature of this short basis is that a1 == b2, and indeed -b1 is rather close to a2. +# NOTE: a1 == b2 (= 2x+1) and a2 ≈ |b1| (= 6x²+4x+1 vs 6x²+2x). 
+# This is a structural consequence of the BN parameterization; see §17 for the full explanation. +# In particular, we can verify these polynomial identities: +assert a1 == 2 * x_bn + 1, "a1 = 2x + 1" +assert b2 == 2 * x_bn + 1, "b2 = 2x + 1" +assert a2 == 6 * x_bn**2 + 4 * x_bn + 1, "a2 = 6x^2 + 4x + 1" +assert -b1 == 6 * x_bn**2 + 2 * x_bn, "|b1| = 6x^2 + 2x" # Verify that the vectors are in the lattice: ai + λ·bi ≡ 0 (mod r) assert (a1 + lambda_val * b1) % r == 0, "Lattice vector 1 must satisfy a1 + λ·b1 ≡ 0" @@ -275,7 +287,6 @@ def compute_splitting_constants(modulus, b1, b2): # ==================================================================================== # # Computes (k1, k2) with k ≡ k1 - λ·k2 (mod r) and |k1|, |k2| < 2^128. -# See §0 for the derivation (Babai's nearest plane). # # SUBTLETY — k2 CAN BE NEGATIVE: # @@ -393,6 +404,8 @@ def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): # The base field modulus of BN254 (from bn254/fq.hpp) fq_modulus = 0x30644E72E131A029B85045B68181585D97816A916871CA8D3C208C16D87CFD47 +assert fq_modulus == 36*x_bn**4 + 36*x_bn**3 + 24*x_bn**2 + 6*x_bn + 1, "q = 36x⁴ + 36x³ + 24x² + 6x + 1" +assert fq_modulus - r == 6 * x_bn**2, "q − r = 6x²" # Montgomery parameter for Fq: R = 2^256 mod q fq_R = pow(2, 256, fq_modulus) @@ -430,6 +443,18 @@ def verify_split(k, k1, k2, t1, t2, lambda_val, modulus): fq_det = fq_a1 * fq_b2 - fq_a2 * fq_b1 assert abs(fq_det) == fq_modulus, f"Fq lattice determinant must be ±q, got {fq_det}" +# Verify polynomial structure and near-identity with Fr basis (see §17 for explanation): +assert fq_a1 == 2 * x_bn, "Fq: a1 = 2x" +assert fq_b2 == 2 * x_bn, "Fq: b2 = 2x" +assert fq_a2 == 6 * x_bn**2 + 4 * x_bn + 1, "Fq: a2 = 6x² + 4x + 1 (same as Fr!)" +assert -fq_b1 == 6 * x_bn**2 + 2 * x_bn + 1, "Fq: |b1| = 6x² + 2x + 1" + +# The Fr and Fq bases differ by at most 1 in each component: +assert a1 - fq_a1 == 1, "a1: Fr has 2x+1, Fq has 2x" +assert b2 - fq_b2 == 1, "b2: Fr has 2x+1, Fq has 2x" +assert 
fq_a2 == a2, "a2: identical for both fields" +assert (-fq_b1) - (-b1) == 1, "|b1|: Fq has 6x²+2x+1, Fr has 6x²+2x" + # ==================================================================================== # § 8. BN254 Fq — PRECOMPUTED CONSTANTS AND VERIFICATION @@ -716,6 +741,40 @@ def measure_overflow_frequency(n_samples=500_000): assert 20 < pct_k1 < 35, f"Expected ~25% for k1, got {pct_k1}%" assert 0.1 < pct_k2 < 1.0, f"Expected ~0.3% for k2, got {pct_k2}%" +# ==================================================================================== +# § 17. APPENDIX: WHY THE BN254 Fr AND Fq LATTICE BASES ARE NEARLY IDENTICAL +# ==================================================================================== +# +# BN254 is parameterized by a single integer x = 0x44e992b44a6909f1 (63 bits). +# Both field primes are polynomials in x: +# +# r = 36x⁴ + 36x³ + 18x² + 6x + 1 (scalar field, Fr) +# q = 36x⁴ + 36x³ + 24x² + 6x + 1 (base field, Fq) +# +# They differ by q − r = 6x² ≈ 2¹²⁷, tiny relative to the 254-bit primes. +# +# The GLV lattice basis vectors turn out to be simple polynomials in x: +# +# Fr basis: (a1, b1) = (2x+1, −(6x²+2x)), (a2, b2) = (6x²+4x+1, 2x+1) +# Fq basis: (a1, b1) = (2x, −(6x²+2x+1)), (a2, b2) = (6x²+4x+1, 2x ) +# +# These are verified by the determinant identities: +# +# (2x+1)² + (6x²+4x+1)(6x²+2x) = 36x⁴ + 36x³ + 18x² + 6x + 1 = r +# (2x)² + (6x²+4x+1)(6x²+2x+1) = 36x⁴ + 36x³ + 24x² + 6x + 1 = q +# +# Notice a1 = b2 for both fields (the basis matrix is "almost symmetric"), and a2 +# is IDENTICAL for both. The only difference: Fr has the +1 on a1/b2, while Fq has +# the +1 on |b1|. All four components differ by at most 1 between Fr and Fq. +# +# Reference: The BN prime parameterization is from Barreto–Naehrig (2006). The +# polynomial structure of GLV short bases for CM curves is discussed in +# B. 
Smith, "Easy scalar decompositions for efficient scalar multiplication +# on elliptic curves and genus 2 Jacobians" (2013), eprint.iacr.org/2013/672. + +# ==================================================================================== +# § 18. MAIN FUNCTION +# ==================================================================================== if __name__ == "__main__": print("=== Part I: BN254 Fr ===") print(f" λ (cube root): {hex(lambda_val)}")