diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1e0f1e347..cf114d3f7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -40,6 +40,10 @@ jobs: run: cargo clippy --all-targets -- -D warnings - name: Run tests + # Temporary workaround for Windows path issue, see: + # https://github.com/nextest-rs/nextest/issues/1493#issuecomment-2106331574 + env: + RUSTUP_WINDOWS_PATH_ADD_BIN: 1 run: cargo nextest run --no-fail-fast --all-targets # doctests are special [^1] but this step does not incur a performance penalty [^2] diff --git a/cliff.toml b/cliff.toml index 35dc6c202..5322b0889 100644 --- a/cliff.toml +++ b/cliff.toml @@ -58,6 +58,7 @@ commit_parsers = [ { body = ".*security", group = " 🔒️ Security" }, { message = "^revert", group = " ⏪️ Revert" }, { message = "^style", group = " 🎨 Styling" }, + { message = "^support", skip = true }, ] protect_breaking_commits = false diff --git a/twenty-first/Cargo.toml b/twenty-first/Cargo.toml index 529ee9db4..930e95bef 100644 --- a/twenty-first/Cargo.toml +++ b/twenty-first/Cargo.toml @@ -67,15 +67,19 @@ name = "evaluation" harness = false [[bench]] -name = "poly_mod_reduce" +name = "extrapolation" harness = false [[bench]] -name = "interpolation" +name = "coset_extrapolation" harness = false [[bench]] -name = "poly_div" +name = "poly_mod_reduce" +harness = false + +[[bench]] +name = "interpolation" harness = false [[bench]] diff --git a/twenty-first/benches/coset_extrapolation.rs b/twenty-first/benches/coset_extrapolation.rs new file mode 100644 index 000000000..74f6fdd14 --- /dev/null +++ b/twenty-first/benches/coset_extrapolation.rs @@ -0,0 +1,37 @@ +use criterion::criterion_group; +use criterion::criterion_main; +use criterion::BenchmarkId; +use criterion::Criterion; + +use twenty_first::math::other::random_elements; +use twenty_first::prelude::*; + +criterion_main!(benches); +criterion_group!( + name = benches; + config = Criterion::default().sample_size(10); + targets = coset_extrapolation<{ 1 << 18 }, { 1 << 8 }>, + coset_extrapolation<{ 1 << 19 }, { 1 << 8 }>, + coset_extrapolation<{ 1 << 20 }, { 1 << 8 }>, + coset_extrapolation<{ 1 << 21 }, { 1 << 8 }>, + coset_extrapolation<{ 1 << 22 }, { 1 << 8 }>, + coset_extrapolation<{ 1 << 23 }, { 1 << 8 }>, +); + +fn coset_extrapolation(c: &mut Criterion) { + let log2_of_size = SIZE.ilog2(); + let mut group = c.benchmark_group(format!( + "Fast extrapolation of length-{SIZE} codeword in {NUM_POINTS} Points" + )); + + let codeword = random_elements(SIZE); + let offset = BFieldElement::new(7); + let eval_points: Vec = random_elements(NUM_POINTS); + + let id = BenchmarkId::new("Fast Codeword Extrapolation", log2_of_size); + group.bench_function(id, |b| { + b.iter(|| Polynomial::::coset_extrapolate(offset, &codeword, &eval_points)) + }); + + group.finish(); +} diff --git a/twenty-first/benches/evaluation.rs b/twenty-first/benches/evaluation.rs index ae06c426c..6a6a107ce 100644 --- a/twenty-first/benches/evaluation.rs +++ b/twenty-first/benches/evaluation.rs @@ -4,6 +4,7 @@ use criterion::BenchmarkId; use criterion::Criterion; use twenty_first::math::other::random_elements; +use twenty_first::math::zerofier_tree::ZerofierTree; use twenty_first::prelude::*; criterion_main!(benches); @@ -36,15 +37,18 @@ fn evaluation(c: &mut Criterion) { b.iter(|| poly.iterative_batch_evaluate(&eval_points)) }); - // `vector_batch_evaluate` exists, but is super slow. Put it here if you plan to run benchmarks - // during a coffee break. + let id = BenchmarkId::new("Zerofier Tree", log2_of_size); + group.bench_function(id, |b| { + b.iter(|| ZerofierTree::new_from_domain(&eval_points)) + }); let id = BenchmarkId::new("Divide-and-Conquer", log2_of_size); + let zerofier_tree = ZerofierTree::new_from_domain(&eval_points); group.bench_function(id, |b| { - b.iter(|| poly.divide_and_conquer_batch_evaluate(&eval_points)) + b.iter(|| poly.divide_and_conquer_batch_evaluate(&zerofier_tree)) }); - let id = BenchmarkId::new("Dispatcher", log2_of_size); + let id = BenchmarkId::new("Entrypoint", log2_of_size); group.bench_function(id, |b| b.iter(|| poly.batch_evaluate(&eval_points))); group.finish(); diff --git a/twenty-first/benches/extrapolation.rs b/twenty-first/benches/extrapolation.rs new file mode 100644 index 000000000..0577b2d02 --- /dev/null +++ b/twenty-first/benches/extrapolation.rs @@ -0,0 +1,95 @@ +use criterion::criterion_group; +use criterion::criterion_main; +use criterion::BenchmarkId; +use criterion::Criterion; + +use twenty_first::math::ntt::intt; +use twenty_first::math::other::random_elements; +use twenty_first::math::traits::PrimitiveRootOfUnity; +use twenty_first::math::zerofier_tree::ZerofierTree; +use twenty_first::prelude::*; + +criterion_main!(benches); +criterion_group!( + name = benches; + config = Criterion::default().sample_size(10); + targets = extrapolation<{ 1 << 18 }, { 1 << 6 }>, + extrapolation<{ 1 << 18 }, { 1 << 7 }>, + extrapolation<{ 1 << 18 }, { 1 << 8 }>, + extrapolation<{ 1 << 19 }, { 1 << 6 }>, + extrapolation<{ 1 << 19 }, { 1 << 7 }>, + extrapolation<{ 1 << 19 }, { 1 << 8 }>, + extrapolation<{ 1 << 20 }, { 1 << 6 }>, + extrapolation<{ 1 << 20 }, { 1 << 7 }>, + extrapolation<{ 1 << 20 }, { 1 << 8 }>, +); + +fn intt_then_evaluate( + codeword: &[BFieldElement], + offset: BFieldElement, + zerofier_tree: &ZerofierTree, + shift_coefficients: &[BFieldElement], + tail_length: usize, +) -> Vec { + let omega = BFieldElement::primitive_root_of_unity(codeword.len() as u64).unwrap(); + let log_domain_length = codeword.len().ilog2(); + let mut coefficients = codeword.to_vec(); + intt(&mut coefficients, omega, log_domain_length); + let polynomial: Polynomial = Polynomial::new(coefficients) + .scale(offset.inverse()) + .reduce_by_ntt_friendly_modulus(shift_coefficients, tail_length); + polynomial.divide_and_conquer_batch_evaluate(zerofier_tree) +} + +fn extrapolation(c: &mut Criterion) { + let log2_of_size = SIZE.ilog2(); + let mut group = c.benchmark_group(format!( + "Extrapolation of length-{SIZE} codeword in {NUM_POINTS} Points" + )); + + let codeword = random_elements(SIZE); + let offset = BFieldElement::new(7); + let eval_points: Vec = random_elements(NUM_POINTS); + + let zerofier_tree = ZerofierTree::new_from_domain(&eval_points); + let modulus = zerofier_tree.zerofier(); + let preprocessing_data = + Polynomial::fast_modular_coset_interpolate_preprocess(SIZE, offset, &modulus); + + let id = BenchmarkId::new("INTT-then-Evaluate", log2_of_size); + group.bench_function(id, |b| { + b.iter(|| { + intt_then_evaluate( + &codeword, + offset, + &zerofier_tree, + &preprocessing_data.shift_coefficients, + preprocessing_data.tail_length, + ) + }) + }); + + // We used to have another benchmark here that used barycentric evaluation + // (from `fri.rs` in repo triton-vm) inside of a loop over all points. It + // was never close to faster. + let id = BenchmarkId::new("Fast Codeword Extrapolation", log2_of_size); + group.bench_function(id, |b| { + b.iter(|| { + let minimal_interpolant = + Polynomial::::fast_modular_coset_interpolate_with_zerofiers_and_ntt_friendly_multiple( + &codeword, + offset, + &modulus, + &preprocessing_data + ); + minimal_interpolant.divide_and_conquer_batch_evaluate(&zerofier_tree) + }) + }); + + let id = BenchmarkId::new("Dispatcher (includes preprocessing)", log2_of_size); + group.bench_function(id, |b| { + b.iter(|| Polynomial::coset_extrapolate(offset, &codeword, &eval_points)) + }); + + group.finish(); +} diff --git a/twenty-first/benches/poly_div.rs b/twenty-first/benches/poly_div.rs deleted file mode 100644 index 8cf4193cc..000000000 --- a/twenty-first/benches/poly_div.rs +++ /dev/null @@ -1,44 +0,0 @@ -use criterion::criterion_group; -use criterion::criterion_main; -use criterion::BenchmarkId; -use criterion::Criterion; - -use twenty_first::math::other::random_elements; -use twenty_first::prelude::*; - -criterion_main!(benches); -criterion_group!( - name = benches; - config = Criterion::default(); - targets = poly_div<12, 6>, - poly_div<12, 8>, - poly_div<12, 12>, - poly_div<13, 6>, - poly_div<13, 8>, - poly_div<13, 12>, - poly_div<14, 6>, - poly_div<14, 8>, - poly_div<14, 12>, -); - -fn poly_div(c: &mut Criterion) { - let mut group = c.benchmark_group(format!( - "Division of Polynomials – \ - Dividend Degree: 2^{LOG2_NUM_DEG}, Divisor Degree: 2^{LOG2_DEN_DEG}" - )); - let bench_param = || format!("{LOG2_NUM_DEG}|{LOG2_DEN_DEG}"); - - let num = Polynomial::::new(random_elements((1 << LOG2_NUM_DEG) + 1)); - let den = Polynomial::::new(random_elements((1 << LOG2_DEN_DEG) + 1)); - - let id = BenchmarkId::new("Naïve", bench_param()); - group.bench_function(id, |b| b.iter(|| num.naive_divide(&den))); - - let id = BenchmarkId::new("Fast", bench_param()); - group.bench_function(id, |b| b.iter(|| num.fast_divide(&den))); - - let id = BenchmarkId::new("Faster of the two", bench_param()); - group.bench_function(id, |b| b.iter(|| num.divide(&den))); - - group.finish(); -} diff --git a/twenty-first/benches/poly_mod_reduce.rs b/twenty-first/benches/poly_mod_reduce.rs index d19498cb5..b7dc60e0c 100644 --- a/twenty-first/benches/poly_mod_reduce.rs +++ b/twenty-first/benches/poly_mod_reduce.rs @@ -30,12 +30,6 @@ fn poly_mod_reduce(c: &mut Criteri let id = BenchmarkId::new("long division", log2_of_size); group.bench_function(id, |b| b.iter(|| lhs.clone() % rhs.clone())); - // despite its name, `.fast_divide()` is slow – ignore for big inputs - if SIZE_LHS < 1 << 13 && SIZE_RHS < 1 << 13 { - let id = BenchmarkId::new("fast division", log2_of_size); - group.bench_function(id, |b| b.iter(|| lhs.fast_divide(&rhs))); - } - let id = BenchmarkId::new("fast reduce", log2_of_size); group.bench_function(id, |b| b.iter(|| lhs.fast_reduce(&rhs))); diff --git a/twenty-first/benches/zerofier.rs b/twenty-first/benches/zerofier.rs index 1d0243b7d..c7ce85a2d 100644 --- a/twenty-first/benches/zerofier.rs +++ b/twenty-first/benches/zerofier.rs @@ -35,7 +35,7 @@ fn zerofier(c: &mut Criterion) { let id = BenchmarkId::new("Fast", SIZE); group.bench_function(id, |b| b.iter(|| Polynomial::fast_zerofier(&roots))); - let id = BenchmarkId::new("Fastest of the three", SIZE); + let id = BenchmarkId::new("Dispatcher", SIZE); group.bench_function(id, |b| b.iter(|| Polynomial::zerofier(&roots))); group.finish(); diff --git a/twenty-first/src/lib.rs b/twenty-first/src/lib.rs index 1d175ade0..1318124fa 100644 --- a/twenty-first/src/lib.rs +++ b/twenty-first/src/lib.rs @@ -67,6 +67,12 @@ pub(crate) mod tests { implements_usual_auto_traits::>(); implements_usual_auto_traits::(); implements_usual_auto_traits::>(); + implements_usual_auto_traits::>(); + implements_usual_auto_traits::>(); + implements_usual_auto_traits::>(); + implements_usual_auto_traits::< + math::polynomial::ModularInterpolationPreprocessingData, + >(); } #[test] diff --git a/twenty-first/src/math.rs b/twenty-first/src/math.rs index 54323836e..b60f5403a 100644 --- a/twenty-first/src/math.rs +++ b/twenty-first/src/math.rs @@ -9,3 +9,4 @@ pub mod polynomial; pub mod tip5; pub mod traits; pub mod x_field_element; +pub mod zerofier_tree; diff --git a/twenty-first/src/math/b_field_element.rs b/twenty-first/src/math/b_field_element.rs index 9b4e5fc2c..5f628eee6 100644 --- a/twenty-first/src/math/b_field_element.rs +++ b/twenty-first/src/math/b_field_element.rs @@ -215,6 +215,9 @@ impl BFieldElement { /// 2^128 mod P; this is used for conversion of elements into Montgomery representation. const R2: u64 = 0xFFFFFFFE00000001; + /// -2^-1 + pub const MINUS_TWO_INVERSE: Self = Self::new(9223372034707292160); + #[inline] pub const fn new(value: u64) -> Self { Self(Self::montyred((value as u128) * (Self::R2 as u128))) @@ -1300,4 +1303,9 @@ mod b_prime_field_element_test { fn bfe_macro_produces_same_result_as_calling_new(value: u64) { prop_assert_eq!(BFieldElement::new(value), bfe!(value)); } + + #[test] + fn const_minus_two_inverse_is_really_minus_two_inverse() { + assert_eq!(bfe!(-2).inverse(), BFieldElement::MINUS_TWO_INVERSE); + } } diff --git a/twenty-first/src/math/polynomial.rs b/twenty-first/src/math/polynomial.rs index ae0fe3a92..cabd4b586 100644 --- a/twenty-first/src/math/polynomial.rs +++ b/twenty-first/src/math/polynomial.rs @@ -30,6 +30,7 @@ use crate::prelude::Inverse; use crate::prelude::XFieldElement; use super::traits::PrimitiveRootOfUnity; +use super::zerofier_tree::ZerofierTree; impl Zero for Polynomial { fn zero() -> Self { @@ -55,6 +56,17 @@ impl One for Polynomial { } } +/// Data produced by the preprocessing phase of a batch modular interpolation. +/// Marked `pub` for benchmarking purposes. Not part of the public API. +#[doc(hidden)] +#[derive(Debug, Clone)] +pub struct ModularInterpolationPreprocessingData { + pub even_zerofiers: Vec>, + pub odd_zerofiers: Vec>, + pub shift_coefficients: Vec, + pub tail_length: usize, +} + /// A univariate polynomial with coefficients in a [finite field](FiniteField), in monomial form. #[derive(Clone, Arbitrary)] pub struct Polynomial { @@ -134,21 +146,16 @@ where /// Extracted from `cargo bench --bench poly_mul` on mjolnir. const FAST_MULTIPLY_CUTOFF_THRESHOLD: isize = 1 << 8; - /// [Divide and conquer batch evaluation](Self::divide_and_conquer_batch_evaluate) - /// is slower than [iterative evaluation](Self::iterative_batch_evaluate) - /// on domains smaller than this threshold - const DIVIDE_AND_CONQUER_EVALUATE_CUTOFF_THRESHOLD: usize = 1 << 5; - /// Computing the [fast zerofier][fast] is slower than computing the [smart zerofier][smart] for /// domain sizes smaller than this threshold. The [naïve zerofier][naive] is always slower to /// compute than the [smart zerofier][smart] for domain sizes smaller than the threshold. /// - /// Extracted from `cargo bench --bench zerofier` on mjolnir. + /// Extracted from `cargo bench --bench zerofier`. /// /// [naive]: Self::naive_zerofier /// [smart]: Self::smart_zerofier /// [fast]: Self::fast_zerofier - const FAST_ZEROFIER_CUTOFF_THRESHOLD: usize = 200; + const FAST_ZEROFIER_CUTOFF_THRESHOLD: usize = 100; /// [Fast interpolation](Self::fast_interpolate) is slower than /// [Lagrange interpolation](Self::lagrange_interpolate) below this threshold. @@ -156,6 +163,22 @@ where /// Extracted from `cargo bench --bench interpolation` on mjolnir. const FAST_INTERPOLATE_CUTOFF_THRESHOLD: usize = 1 << 8; + /// Regulates the recursion depth at which + /// [Fast modular coset interpolation](Self::fast_modular_coset_interpolate) + /// is slower and switches to + /// [Lagrange interpolation](Self::lagrange_interpolate). + const FAST_MODULAR_COSET_INTERPOLATE_CUTOFF_THRESHOLD_PREFER_LAGRANGE: usize = 1 << 8; + + /// Regulates the recursion depth at which + /// [Fast modular coset interpolation](Self::fast_modular_coset_interpolate) + /// is slower and switches to [INTT](ntt::intt)-then-[reduce](Self::reduce). + const FAST_MODULAR_COSET_INTERPOLATE_CUTOFF_THRESHOLD_PREFER_INTT: usize = 1 << 17; + + /// Regulates when to prefer the [Fast coset extrapolation](Self::fast_coset_extrapolate) + /// over the [naïve method](Self::naive_coset_extrapolate). Threshold found + /// using `cargo criterion --bench extrapolation`. + const FAST_COSET_EXTRAPOLATE_THRESHOLD: usize = 100; + /// Inside `formal_power_series_inverse`, when to multiply naively and when /// to use NTT-based multiplication. Use benchmark /// `formal_power_series_inverse` to find the optimum. Based on benchmarks, @@ -175,9 +198,6 @@ where /// when. const REDUCE_BEFORE_EVALUATE_THRESHOLD_RATIO: isize = 4; - /// Regulates when to use long division and when to use NTT-based divide. - const FAST_DIVIDE_THRESHOLD: isize = 1000; - /// Return the polynomial which corresponds to the transformation `x → α·x`. /// /// Given a polynomial P(x), produce P'(x) := P(α·x). Evaluating P'(x) then corresponds to @@ -347,6 +367,51 @@ where Self::new(hadamard_product) } + /// Multiply a bunch of polynomials together. + pub fn batch_multiply(factors: &[Self]) -> Self { + // Build a tree-like structure of multiplications to keep the degrees of the + // factors roughly equal throughout the process. This makes efficient use of + // the `.multiply()` dispatcher. + // In contrast, using a simple `.reduce()`, the accumulator polynomial would + // have a much higher degree than the individual factors. + // todo: benchmark the current approach against the “reduce” approach. + + if factors.is_empty() { + return Polynomial::one(); + } + let mut products = factors.to_vec(); + while products.len() != 1 { + products = products + .chunks(2) + .map(|chunk| match chunk.len() { + 2 => chunk[0].multiply(&chunk[1]), + 1 => chunk[0].clone(), + _ => unreachable!(), + }) + .collect(); + } + products.pop().unwrap() + } + + /// Parallel version of [`batch_multiply`](Self::batch_multiply). + pub fn par_batch_multiply(factors: &[Self]) -> Self { + if factors.is_empty() { + return Polynomial::one(); + } + let num_threads = available_parallelism() + .map(|non_zero_usize| non_zero_usize.get()) + .unwrap_or(1); + let mut products = factors.to_vec(); + while products.len() != 1 { + let chunk_size = usize::max(2, products.len() / num_threads); + products = products + .par_chunks(chunk_size) + .map(Self::batch_multiply) + .collect(); + } + products.pop().unwrap() + } + /// Compute the lowest degree polynomial with the provided roots. /// Also known as “vanishing polynomial.” /// @@ -372,6 +437,25 @@ where } } + /// Parallel version of [`zerofier`](Self::zerofier). + pub fn par_zerofier(roots: &[FF]) -> Self { + if roots.is_empty() { + return Self::one(); + } + let num_threads = available_parallelism() + .map(|non_zero_usize| non_zero_usize.get()) + .unwrap_or(1); + let chunk_size = roots + .len() + .div_ceil(num_threads) + .max(Self::FAST_ZEROFIER_CUTOFF_THRESHOLD); + let factors = roots + .par_chunks(chunk_size) + .map(|chunk| Self::zerofier(chunk)) + .collect::>(); + Self::par_batch_multiply(&factors) + } + /// Only `pub` to allow benchmarking; not considered part of the public API. #[doc(hidden)] pub fn smart_zerofier(roots: &[FF]) -> Self { @@ -392,10 +476,8 @@ where #[doc(hidden)] pub fn fast_zerofier(roots: &[FF]) -> Self { let mid_point = roots.len() / 2; - let (left, right) = rayon::join( - || Self::zerofier(&roots[..mid_point]), - || Self::zerofier(&roots[mid_point..]), - ); + let left = Self::zerofier(&roots[..mid_point]); + let right = Self::zerofier(&roots[mid_point..]); left.multiply(&right) } @@ -756,25 +838,34 @@ where /// Evaluate the polynomial on a batch of points. pub fn batch_evaluate(&self, domain: &[FF]) -> Vec { - if self.degree() >= Self::REDUCE_BEFORE_EVALUATE_THRESHOLD_RATIO * (domain.len() as isize) { + if self.is_zero() { + vec![FF::zero(); domain.len()] + } else if self.degree() + >= Self::REDUCE_BEFORE_EVALUATE_THRESHOLD_RATIO * (domain.len() as isize) + { self.reduce_then_batch_evaluate(domain) - } else if domain.len() >= Self::DIVIDE_AND_CONQUER_EVALUATE_CUTOFF_THRESHOLD { - self.divide_and_conquer_batch_evaluate(domain) } else { - self.iterative_batch_evaluate(domain) + let zerofier_tree = ZerofierTree::new_from_domain(domain); + self.divide_and_conquer_batch_evaluate(&zerofier_tree) } } fn reduce_then_batch_evaluate(&self, domain: &[FF]) -> Vec { - let zerofier = Self::zerofier(domain); + let zerofier_tree = ZerofierTree::new_from_domain(domain); + let zerofier = zerofier_tree.zerofier(); let remainder = self.fast_reduce(&zerofier); - remainder.batch_evaluate(domain) + remainder.divide_and_conquer_batch_evaluate(&zerofier_tree) } /// Parallel version of [`batch_evaluate`](Self::batch_evaluate). pub fn par_batch_evaluate(&self, domain: &[FF]) -> Vec { - let num_threads = available_parallelism().map(|nzu| nzu.get()).unwrap_or(1); - let chunk_size = domain.len().next_multiple_of(num_threads) / num_threads; + if domain.is_empty() || self.is_zero() { + return vec![FF::zero(); domain.len()]; + } + let num_threads = available_parallelism() + .map(|non_zero_usize| non_zero_usize.get()) + .unwrap_or(1); + let chunk_size = domain.len().div_ceil(num_threads); domain .par_chunks(chunk_size) .flat_map(|ch| self.batch_evaluate(ch)) @@ -787,37 +878,20 @@ where domain.iter().map(|&p| self.evaluate(p)).collect() } - /// Only marked `pub` for benchmarking; not considered part of the public API. - #[doc(hidden)] - pub fn par_evaluate(&self, domain: &[FF]) -> Vec { - domain.par_iter().map(|&p| self.evaluate(p)).collect() - } - /// Only `pub` to allow benchmarking; not considered part of the public API. #[doc(hidden)] - pub fn vector_batch_evaluate(&self, domain: &[FF]) -> Vec { - let mut accumulators = vec![FF::zero(); domain.len()]; - for &c in self.coefficients.iter().rev() { - accumulators - .par_iter_mut() - .zip(domain) - .for_each(|(acc, &x)| *acc = *acc * x + c); + pub fn divide_and_conquer_batch_evaluate(&self, zerofier_tree: &ZerofierTree) -> Vec { + match zerofier_tree { + ZerofierTree::Leaf(leaf) => self + .reduce(&zerofier_tree.zerofier()) + .iterative_batch_evaluate(&leaf.points), + ZerofierTree::Branch(branch) => [ + self.divide_and_conquer_batch_evaluate(&branch.left), + self.divide_and_conquer_batch_evaluate(&branch.right), + ] + .concat(), + ZerofierTree::Padding => vec![], } - - accumulators - } - - /// Only `pub` to allow benchmarking; not considered part of the public API. - #[doc(hidden)] - pub fn divide_and_conquer_batch_evaluate(&self, domain: &[FF]) -> Vec { - let mid_point = domain.len() / 2; - let left_half = &domain[..mid_point]; - let right_half = &domain[mid_point..]; - - [left_half, right_half] - .into_iter() - .flat_map(|half_domain| self.batch_evaluate(half_domain)) - .collect() } /// Fast evaluate on a coset domain, which is the group generated by `generator^i * offset`. @@ -890,11 +964,9 @@ where /// /// Panics if the `divisor` is zero. pub fn divide(&self, divisor: &Self) -> (Self, Self) { - if self.degree() > Self::FAST_DIVIDE_THRESHOLD { - self.fast_divide(divisor) - } else { - self.naive_divide(divisor) - } + // There is an NTT-based division algorithm, but for no practical + // parameter set is it faster than long division. + self.naive_divide(divisor) } /// Compute a polynomial g(X) from a given polynomial f(X) such that @@ -1095,7 +1167,16 @@ where let reverse = self.reverse(); let inverse_reverse = reverse.formal_power_series_inverse_minimal(n as usize); let product_reverse = reverse.multiply(&inverse_reverse); - product_reverse.reverse() + + // Correct for lost trailing coefficients, if any + let product = product_reverse.reverse(); + let product_degree = product.degree(); + let shift = if product_degree < n { + n - product_degree + } else { + 0 + }; + product.shift_coefficients(shift as usize) } /// Given a polynomial f(X) and an integer n, find a multiple of f(X) of the @@ -1114,7 +1195,9 @@ where [vec![FF::from(0); n], vec![self.coefficients[0].inverse()]].concat(), ); } + let reverse = self.reverse(); + // The next function gives back a polynomial g(X) of degree at most arg, // such that f(X) * g(X) = 1 mod X^arg. // Without modular reduction, the degree of the product f(X) * g(X) is @@ -1122,7 +1205,11 @@ where // and arg = n - deg(f). let inverse_reverse = reverse.formal_power_series_inverse_minimal(n - degree); let product_reverse = reverse.multiply(&inverse_reverse); - product_reverse.reverse() + let product = product_reverse.reverse(); + + // Coefficient reversal drops trailing zero. Correct for that. + let product_degree = product.degree() as usize; + product.shift_coefficients(n - product_degree) } /// Reduces f(X) by a structured modulus, which is of the form @@ -1130,16 +1217,15 @@ where /// form, polynomial modular reductions can be computed faster than in the /// generic case. fn reduce_by_structured_modulus(&self, multiple: &Self) -> Self { - let leading_term = Polynomial::new( - [ - vec![FF::from(0); multiple.degree() as usize], - vec![FF::from(1); 1], - ] - .concat(), - ); + assert_ne!(multiple.degree(), 0); + let multiple_degree = usize::try_from(multiple.degree()).expect("cannot reduce by zero"); + let leading_term = + Polynomial::new([vec![FF::from(0); multiple_degree], vec![FF::from(1); 1]].concat()); let shift_polynomial = multiple.clone() - leading_term.clone(); - let m = shift_polynomial.degree() as usize + 1; - let n = multiple.degree() as usize - m; + let m = usize::try_from(shift_polynomial.degree()) + .map(|unsigned_degree| unsigned_degree + 1) + .unwrap_or(0); + let n = multiple_degree - m; if self.coefficients.len() < n + m { return self.clone(); } @@ -1177,28 +1263,37 @@ where /// /// This method uses NTT-based multiplication, meaning that the unstructured /// part of the structured multiple must be given in NTT-domain. - fn reduce_by_ntt_friendly_modulus(&self, shift_ntt: &[FF], m: usize) -> Self { + /// + /// This function is marked `pub` for benchmarking. Not considered part of + /// the public API + #[doc(hidden)] + pub fn reduce_by_ntt_friendly_modulus(&self, shift_ntt: &[FF], tail_length: usize) -> Self { let domain_length = shift_ntt.len(); assert!(domain_length.is_power_of_two()); let log_domain_length = domain_length.ilog2(); let omega = BFieldElement::primitive_root_of_unity(domain_length as u64).unwrap(); - let n = domain_length - m; + let chunk_size = domain_length - tail_length; - if self.coefficients.len() < n + m { + if self.coefficients.len() < chunk_size + tail_length { return self.clone(); } - let num_reducible_chunks = (self.coefficients.len() - (m + n) + n - 1) / n; + let num_reducible_chunks = + (self.coefficients.len() - (tail_length + chunk_size)).div_ceil(chunk_size); - let range_start = num_reducible_chunks * n; + let range_start = num_reducible_chunks * chunk_size; let mut working_window = if range_start >= self.coefficients.len() { - vec![FF::from(0); n + m] + vec![FF::from(0); chunk_size + tail_length] } else { self.coefficients[range_start..].to_vec() }; - working_window.resize(n + m, FF::from(0)); + working_window.resize(chunk_size + tail_length, FF::from(0)); for chunk_index in (0..num_reducible_chunks).rev() { - let mut product = [working_window[m..].to_vec(), vec![FF::from(0); m]].concat(); + let mut product = [ + working_window[tail_length..].to_vec(), + vec![FF::from(0); tail_length], + ] + .concat(); ntt(&mut product, omega, log_domain_length); product .iter_mut() @@ -1206,12 +1301,20 @@ where .for_each(|(l, r)| *l *= *r); intt(&mut product, omega, log_domain_length); - working_window = [vec![FF::from(0); n], working_window[0..m].to_vec()].concat(); - for (i, wwi) in working_window.iter_mut().enumerate().take(n) { - *wwi = self.coefficients[chunk_index * n + i]; + working_window = [ + vec![FF::from(0); chunk_size], + working_window[0..tail_length].to_vec(), + ] + .concat(); + for (i, wwi) in working_window.iter_mut().enumerate().take(chunk_size) { + *wwi = self.coefficients[chunk_index * chunk_size + i]; } - for (i, wwi) in working_window.iter_mut().enumerate().take(n + m) { + for (i, wwi) in working_window + .iter_mut() + .enumerate() + .take(chunk_size + tail_length) + { *wwi -= product[i]; } } @@ -1219,6 +1322,34 @@ where Polynomial::new(working_window) } + /// Only marked `pub` for benchmarking purposes. Not considered part of the + /// public API. + #[doc(hidden)] + pub fn shift_factor_ntt_with_tail_size(&self) -> (Vec, usize) { + let n = usize::max( + Polynomial::::FAST_REDUCE_CUTOFF_THRESHOLD, + self.degree() as usize * 2, + ) + .next_power_of_two(); + let ntt_friendly_multiple = self.structured_multiple_of_degree(n); + // m = 1 + degree(ntt_friendly_multiple - leading term) + let m = 1 + ntt_friendly_multiple + .coefficients + .iter() + .enumerate() + .rev() + .skip(1) + .find_map(|(i, c)| if !c.is_zero() { Some(i) } else { None }) + .unwrap_or(0); + let mut shift_factor_ntt = ntt_friendly_multiple.coefficients[..n].to_vec(); + ntt( + &mut shift_factor_ntt, + BFieldElement::primitive_root_of_unity(n as u64).unwrap(), + n.ilog2(), + ); + (shift_factor_ntt, m) + } + /// Compute the remainder after division of one polynomial by another. This /// method first reduces the numerator by a multiple of the denominator that /// was constructed to enable NTT-based chunk-wise reduction, before @@ -1242,28 +1373,10 @@ where // |- n=2^k coefficients. // This allows us to reduce the numerator's coefficients in chunks of // n-m using NTT-based multiplication over a domain of size n = 2^k. - let n = usize::max( - Polynomial::::FAST_REDUCE_CUTOFF_THRESHOLD, - modulus.degree() as usize * 2, - ) - .next_power_of_two(); - let ntt_friendly_multiple = modulus.structured_multiple_of_degree(n); - // m = 1 + degree(ntt_friendly_multiple - leading term) - let m = 1 + ntt_friendly_multiple - .coefficients - .iter() - .enumerate() - .rev() - .skip(1) - .find_map(|(i, c)| if !c.is_zero() { Some(i) } else { None }) - .unwrap_or(0); - let mut shift_factor_ntt = ntt_friendly_multiple.coefficients[..n].to_vec(); - ntt( - &mut shift_factor_ntt, - BFieldElement::primitive_root_of_unity(n as u64).unwrap(), - n.ilog2(), - ); - let mut intermediate_remainder = self.reduce_by_ntt_friendly_modulus(&shift_factor_ntt, m); + + let (shift_factor_ntt, tail_size) = modulus.shift_factor_ntt_with_tail_size(); + let mut intermediate_remainder = + self.reduce_by_ntt_friendly_modulus(&shift_factor_ntt, tail_size); // 2. Chunk-wise reduction with schoolbook multiplication. // We generate a smaller structured multiple of the denominator that @@ -1281,50 +1394,6 @@ where intermediate_remainder.reduce_long_division(modulus) } - /// Polynomial long division with `self` as the dividend, divided by some `divisor`. - /// Only `pub` to allow benchmarking; not considered part of the public API. - /// - /// As the name implies, the advantage of this method over [`divide`](Self::naive_divide) is - /// runtime complexity. Concretely, this method has time complexity in O(n·log(n)), whereas - /// [`divide`](Self::naive_divide) has time complexity in O(n^2). - #[doc(hidden)] - pub fn fast_divide(&self, divisor: &Self) -> (Self, Self) { - // The math for this function: [0]. There are very slight deviations, for example around the - // requirement that the divisor is monic. - // - // [0] https://cs.uwaterloo.ca/~r5olivei/courses/2021-winter-cs487/lecture5-post.pdf - - let Ok(quotient_degree) = usize::try_from(self.degree() - divisor.degree()) else { - return (Self::zero(), self.clone()); - }; - - if divisor.degree() == 0 { - let quotient = self.scalar_mul(divisor.leading_coefficient().unwrap().inverse()); - let remainder = Polynomial::zero(); - return (quotient, remainder); - } - - // Reverse coefficient vectors to move into formal power series ring over FF, i.e., FF[[x]]. - // Re-interpret as a polynomial to benefit from the already-implemented multiplication - // method, which mechanically work the same in FF[X] and FF[[x]]. - let reverse = |poly: &Self| Self::new(poly.coefficients.iter().copied().rev().collect()); - - // Newton iteration to invert divisor up to required precision. Why is this the required - // precision? Good question. - let precision = (quotient_degree + 1).next_power_of_two(); - - let rev_divisor = reverse(divisor); - let rev_divisor_inverse = rev_divisor.formal_power_series_inverse_newton(precision); - - let self_reverse = reverse(self); - let rev_quotient = self_reverse.multiply(&rev_divisor_inverse); - - let quotient = reverse(&rev_quotient).truncate(quotient_degree); - - let remainder = self.clone() - quotient.multiply(divisor); - (quotient, remainder) - } - /// The degree-`k` polynomial with the same `k + 1` leading coefficients as `self`. To be more /// precise: The degree of the result will be the minimum of `k` and [`Self::degree()`]. This /// implies, among other things, that if `self` [is zero](Self::is_zero()), the result will also @@ -1360,6 +1429,397 @@ where let num_coefficients_to_retain = n.min(self.coefficients.len()); Self::new(self.coefficients[..num_coefficients_to_retain].into()) } + + /// Preprocessing data for + /// [fast modular coset interpolation](Self::fast_modular_coset_interpolate). + /// Marked `pub` for benchmarking. Not considered part of the public API. + #[doc(hidden)] + pub fn fast_modular_coset_interpolate_preprocess( + n: usize, + offset: BFieldElement, + modulus: &Polynomial, + ) -> ModularInterpolationPreprocessingData { + let omega = BFieldElement::primitive_root_of_unity(n as u64).unwrap(); + // a list of polynomials whose ith element is X^(2^i) mod m(X) + let modular_squares = (0..n.ilog2()) + .scan( + Polynomial::::new(vec![FF::from(0), FF::from(1)]), + |acc, _| { + let yld = acc.clone(); + *acc = acc.multiply(acc).reduce(modulus); + Some(yld) + }, + ) + .collect_vec(); + let even_zerofiers = (0..n.ilog2()) + .map(|i| offset.inverse().mod_pow(1u64 << i)) + .zip(modular_squares.iter()) + .map(|(lc, sq)| sq.scalar_mul(FF::from(lc.value())) - Polynomial::one()) + .collect_vec(); + let odd_zerofiers = (0..n.ilog2()) + .map(|i| (offset * omega).inverse().mod_pow(1u64 << i)) + .zip(modular_squares.iter()) + .map(|(lc, sq)| sq.scalar_mul(FF::from(lc.value())) - Polynomial::one()) + .collect_vec(); + + // precompute NTT-friendly multiple of the modulus + let (shift_coefficients, tail_length) = modulus.shift_factor_ntt_with_tail_size(); + + ModularInterpolationPreprocessingData { + even_zerofiers, + odd_zerofiers, + shift_coefficients, + tail_length, + } + } + + /// Compute f(X) mod m(X) where m(X) is a given modulus and f(X) is the + /// interpolant of a list of n values on a domain which is a coset of the + /// size-n subgroup that is identified by some offset. + fn fast_modular_coset_interpolate( + values: &[FF], + offset: BFieldElement, + modulus: &Polynomial, + ) -> Self { + let preprocessing_data = + Self::fast_modular_coset_interpolate_preprocess(values.len(), offset, modulus); + Self::fast_modular_coset_interpolate_with_zerofiers_and_ntt_friendly_multiple( + values, + offset, + modulus, + &preprocessing_data, + ) + } + + /// Only marked `pub` for benchmarking purposes. Not considered part of the + /// interface. + #[doc(hidden)] + pub fn fast_modular_coset_interpolate_with_zerofiers_and_ntt_friendly_multiple( + values: &[FF], + offset: BFieldElement, + modulus: &Polynomial, + preprocessed: &ModularInterpolationPreprocessingData, + ) -> Self { + if modulus.is_zero() { + panic!("cannot reduce modulo zero") + }; + let n = values.len(); + let omega = BFieldElement::primitive_root_of_unity(n as u64).unwrap(); + + if n < Self::FAST_MODULAR_COSET_INTERPOLATE_CUTOFF_THRESHOLD_PREFER_LAGRANGE { + let domain = (0..n) + .scan(FF::from(offset.value()), |acc: &mut FF, _| { + let yld = *acc; + *acc *= omega; + Some(yld) + }) + .collect::>(); + let interpolant = Self::lagrange_interpolate(&domain, values); + return interpolant.reduce(modulus); + } else if n <= Self::FAST_MODULAR_COSET_INTERPOLATE_CUTOFF_THRESHOLD_PREFER_INTT { + let mut coefficients = values.to_vec(); + intt(&mut coefficients, omega, n.ilog2()); + let interpolant = Polynomial::new(coefficients); + + return interpolant + .scale(FF::from(offset.inverse().value())) + .reduce_by_ntt_friendly_modulus( + &preprocessed.shift_coefficients, + preprocessed.tail_length, + ) + .reduce(modulus); + } + + // Use even-odd domain split. + // Even: {offset * omega^{2*i} | i in 0..n/2} + // Odd: {offset * omega^{2*i+1} | i in 0..n/2} + // = {(offset * omega) * omega^{2*i} | i in 0..n/2} + // But we don't actually need to represent the domains explicitly. + + // 1. Get zerofiers. + // The zerofiers are sparse because the domain is structured. + // Even: (offset^-1 * X)^(n/2) - 1 = offset^{-n/2} * X^{n/2} - 1 + // Odd: ((offset * omega)^-1 * X)^(n/2) - 1 + // = offset^{-n/2} * omega^{n/2} * X^{n/2} - 1 + // Note that we are getting the (modularly reduced) zerofiers as + // function arguments. + + // 2. Evaluate zerofiers on opposite domains. + // Actually, the values are compressible because the zerofiers are + // sparse and the domains are structured (compatibly). + // Even zerofier on odd domain: + // (offset^-1 * X)^(n/2) - 1 on + // {(offset * omega) * omega^{2*i} | i in 0..n/2} + // = {omega^{n/2}-1 | i in 0..n/2} = {-2, -2, -2, ...} + // Odd zerofier on even domain: {omega^-i | i in 0..n/2} + // ((offset * omega)^-1 * X)^(n/2) - 1 + // on {offset * omega^{2*i} | i in 0..n/2} + // = {omega^{-n/2} - 1 | i in 0..n/2} = {-2, -2, -2, ...} + // Since these values are always the same, there's no point generating + // them at runtime. Moreover, we need their batch-inverses in the next + // step. + + // 3. Batch-invert zerofiers on opposite domains. + // The batch-inversion is actually not performed because we already know + // the result: {(-2)^-1, (-2)^-1, (-2)^-1, ...}. + const MINUS_TWO_INVERSE: BFieldElement = BFieldElement::MINUS_TWO_INVERSE; + let even_zerofier_on_odd_domain_inverted = vec![FF::from(MINUS_TWO_INVERSE.value()); n / 2]; + let odd_zerofier_on_even_domain_inverted = vec![FF::from(MINUS_TWO_INVERSE.value()); n / 2]; + + // 4. Construct interpolation values through Hadamard products. + let mut odd_domain_targets = even_zerofier_on_odd_domain_inverted; + let mut even_domain_targets = odd_zerofier_on_even_domain_inverted; + for i in 0..(n / 2) { + even_domain_targets[i] *= values[2 * i]; + odd_domain_targets[i] *= values[2 * i + 1]; + } + + // 5. Interpolate using recursion + let even_interpolant = + Self::fast_modular_coset_interpolate(&even_domain_targets, offset, modulus); + let odd_interpolant = + Self::fast_modular_coset_interpolate(&odd_domain_targets, offset * omega, modulus); + + // 6. Multiply with zerofiers and add. + let interpolant = even_interpolant + .multiply(&preprocessed.odd_zerofiers[(n / 2).ilog2() as usize]) + + odd_interpolant.multiply(&preprocessed.even_zerofiers[(n / 2).ilog2() as usize]); + + // 7. Reduce by modulus and return. + interpolant.reduce(modulus) + } + + /// Extrapolate a Reed-Solomon codeword, defined relative to a coset of the + /// subgroup of order n (codeword length), in new points. + pub fn coset_extrapolate( + domain_offset: BFieldElement, + codeword: &[FF], + points: &[FF], + ) -> Vec { + if points.len() < Self::FAST_COSET_EXTRAPOLATE_THRESHOLD { + Self::fast_coset_extrapolate(domain_offset, codeword, points) + } else { + Self::naive_coset_extrapolate(domain_offset, codeword, points) + } + } + + fn naive_coset_extrapolate_preprocessing(points: &[FF]) -> (ZerofierTree, Vec, usize) { + let zerofier_tree = ZerofierTree::new_from_domain(points); + let (shift_coefficients, tail_length) = + Self::shift_factor_ntt_with_tail_size(&zerofier_tree.zerofier()); + (zerofier_tree, shift_coefficients, tail_length) + } + + fn naive_coset_extrapolate( + domain_offset: BFieldElement, + codeword: &[FF], + points: &[FF], + ) -> Vec { + let n = codeword.len(); + let logn = n.ilog2(); + let mut coefficients = codeword.to_vec(); + intt( + &mut coefficients, + BFieldElement::primitive_root_of_unity(n as u64).unwrap(), + logn, + ); + let interpolant = + Polynomial::new(coefficients).scale(FF::from(domain_offset.inverse().value())); + interpolant.batch_evaluate(points) + } + + fn fast_coset_extrapolate( + domain_offset: BFieldElement, + codeword: &[FF], + points: &[FF], + ) -> Vec { + let zerofier_tree = ZerofierTree::new_from_domain(points); + let minimal_interpolant = Self::fast_modular_coset_interpolate( + codeword, + domain_offset, + &zerofier_tree.zerofier(), + ); + minimal_interpolant.divide_and_conquer_batch_evaluate(&zerofier_tree) + } + + /// Extrapolate many Reed-Solomon codewords, defined relative to the same + /// coset of the subgroup of order `codeword_length`, in the same set of + /// new points. + /// + /// # Example + /// ``` + /// # use twenty_first::prelude::*; + /// let n = 1 << 5; + /// let domain_offset = bfe!(7); + /// let codewords = [bfe_vec![3; n], bfe_vec![2; n]].concat(); + /// let points = bfe_vec![0, 1]; + /// assert_eq!( + /// bfe_vec![3, 3, 2, 2], + /// Polynomial::::batch_coset_extrapolate( + /// domain_offset, + /// n, + /// &codewords, + /// &points + /// ) + /// ); + /// ``` + /// + /// # Panics + /// Panics if the `codeword_length` is not a power of two. + pub fn batch_coset_extrapolate( + domain_offset: BFieldElement, + codeword_length: usize, + codewords: &[FF], + points: &[FF], + ) -> Vec { + if points.len() < Self::FAST_COSET_EXTRAPOLATE_THRESHOLD { + Self::batch_fast_coset_extrapolate(domain_offset, codeword_length, codewords, points) + } else { + Self::batch_naive_coset_extrapolate(domain_offset, codeword_length, codewords, points) + } + } + + fn batch_fast_coset_extrapolate( + domain_offset: BFieldElement, + codeword_length: usize, + codewords: &[FF], + points: &[FF], + ) -> Vec { + let n = codeword_length; + + let zerofier_tree = ZerofierTree::new_from_domain(points); + let modulus = zerofier_tree.zerofier(); + let preprocessing_data = Self::fast_modular_coset_interpolate_preprocess( + codeword_length, + domain_offset, + &modulus, + ); + + (0..codewords.len() / n) + .flat_map(|i| { + let codeword = &codewords[i * n..(i + 1) * n]; + let minimal_interpolant = + Self::fast_modular_coset_interpolate_with_zerofiers_and_ntt_friendly_multiple( + codeword, + domain_offset, + &modulus, + &preprocessing_data, + ); + minimal_interpolant.divide_and_conquer_batch_evaluate(&zerofier_tree) + }) + .collect() + } + + fn batch_naive_coset_extrapolate( + domain_offset: BFieldElement, + codeword_length: usize, + codewords: &[FF], + points: &[FF], + ) -> Vec { + let (zerofier_tree, shift_coefficients, tail_length) = + Self::naive_coset_extrapolate_preprocessing(points); + let n = codeword_length; + let logn = n.ilog2(); + + (0..codewords.len() / n) + .flat_map(|i| { + let mut coefficients = codewords[i * n..(i + 1) * n].to_vec(); + intt( + &mut coefficients, + BFieldElement::primitive_root_of_unity(n as u64).unwrap(), + logn, + ); + Polynomial::new(coefficients) + .scale(FF::from(domain_offset.inverse().value())) + .reduce_by_ntt_friendly_modulus(&shift_coefficients, tail_length) + .divide_and_conquer_batch_evaluate(&zerofier_tree) + }) + .collect() + } + + /// Parallel version of [`batch_coset_extrapolate`](Self::batch_coset_extrapolate). + pub fn par_batch_coset_extrapolate( + domain_offset: BFieldElement, + codeword_length: usize, + codewords: &[FF], + points: &[FF], + ) -> Vec { + if points.len() < Self::FAST_COSET_EXTRAPOLATE_THRESHOLD { + Self::par_batch_fast_coset_extrapolate( + domain_offset, + codeword_length, + codewords, + points, + ) + } else { + Self::par_batch_naive_coset_extrapolate( + domain_offset, + codeword_length, + codewords, + points, + ) + } + } + + fn par_batch_fast_coset_extrapolate( + domain_offset: BFieldElement, + codeword_length: usize, + codewords: &[FF], + points: &[FF], + ) -> Vec { + let n = codeword_length; + + let zerofier_tree = ZerofierTree::new_from_domain(points); + let modulus = zerofier_tree.zerofier(); + let preprocessing_data = Self::fast_modular_coset_interpolate_preprocess( + codeword_length, + domain_offset, + &modulus, + ); + + (0..codewords.len() / n) + .into_par_iter() + .flat_map(|i| { + let codeword = &codewords[i * n..(i + 1) * n]; + let minimal_interpolant = + Self::fast_modular_coset_interpolate_with_zerofiers_and_ntt_friendly_multiple( + codeword, + domain_offset, + &modulus, + &preprocessing_data, + ); + minimal_interpolant.divide_and_conquer_batch_evaluate(&zerofier_tree) + }) + .collect() + } + + fn par_batch_naive_coset_extrapolate( + domain_offset: BFieldElement, + codeword_length: usize, + codewords: &[FF], + points: &[FF], + ) -> Vec { + let (zerofier_tree, shift_coefficients, tail_length) = + Self::naive_coset_extrapolate_preprocessing(points); + let n = codeword_length; + let logn = n.ilog2(); + + (0..codewords.len() / n) + .into_par_iter() + .flat_map(|i| { + let mut coefficients = codewords[i * n..(i + 1) * n].to_vec(); + intt( + &mut coefficients, + BFieldElement::primitive_root_of_unity(n as u64).unwrap(), + logn, + ); + Polynomial::new(coefficients) + .scale(FF::from(domain_offset.inverse().value())) + .reduce_by_ntt_friendly_modulus(&shift_coefficients, tail_length) + .divide_and_conquer_batch_evaluate(&zerofier_tree) + }) + .collect() + } } impl Polynomial { @@ -2577,6 +3037,23 @@ mod test_polynomials { prop_assert_eq!(a * b, product); } + #[proptest] + fn batch_multiply_agrees_with_iterative_multiply(a: Vec>) { + let mut acc = Polynomial::one(); + for factor in &a { + acc = acc.multiply(factor); + } + prop_assert_eq!(acc, Polynomial::batch_multiply(&a)); + } + + #[proptest] + fn par_batch_multiply_agrees_with_batch_multiply(a: Vec>) { + prop_assert_eq!( + Polynomial::batch_multiply(&a), + Polynomial::par_batch_multiply(&a) + ); + } + #[proptest(cases = 50)] fn naive_zerofier_and_fast_zerofier_are_identical( #[any(size_range(..Polynomial::::FAST_ZEROFIER_CUTOFF_THRESHOLD * 2).lift())] @@ -2630,6 +3107,13 @@ mod test_polynomials { zerofier.leading_coefficient().unwrap() ); } + #[proptest] + fn par_zerofier_agrees_with_zerofier(domain: Vec) { + prop_assert_eq!( + Polynomial::zerofier(&domain), + Polynomial::par_zerofier(&domain) + ); + } #[test] fn fast_evaluate_on_hardcoded_domain_and_polynomial() { @@ -2651,16 +3135,6 @@ mod test_polynomials { prop_assert_eq!(evaluations, fast_evaluations); } - #[proptest] - fn slow_and_vector_batch_polynomial_evaluation_are_equivalent( - poly: Polynomial, - #[any(size_range(..1024).lift())] domain: Vec, - ) { - let evaluations = domain.iter().map(|&x| poly.evaluate(x)).collect_vec(); - let vector_batch_evaluations = poly.vector_batch_evaluate(&domain); - prop_assert_eq!(evaluations, vector_batch_evaluations); - } - #[test] #[should_panic(expected = "zero points")] fn interpolation_through_no_points_is_impossible() { @@ -2848,15 +3322,6 @@ mod test_polynomials { prop_assert_eq!(a, quot * b); } - #[proptest] - fn naive_division_and_fast_division_are_equivalent( - a: Polynomial, - #[filter(!#b.is_zero())] b: Polynomial, - ) { - let (quotient, _remainder) = a.fast_divide(&b); - prop_assert_eq!(a / b, quotient); - } - #[proptest] fn clean_division_agrees_with_divide_on_clean_division( #[strategy(arb())] a: Polynomial, @@ -3207,7 +3672,7 @@ mod test_polynomials { #[test] fn formal_power_series_inverse_newton_concrete() { - let f = Polynomial::::new(vec![ + let f = Polynomial::new(vec![ BFieldElement::new(3618372803227210457), BFieldElement::new(14620511201754172786), BFieldElement::new(2577803283145951105), @@ -3253,7 +3718,7 @@ mod test_polynomials { #[strategy(vec(arb(), 1..30))] coefficients: Vec, ) { - let polynomial = Polynomial::::new(coefficients); + let polynomial = Polynomial::new(coefficients); let multiple = polynomial.structured_multiple(); let (_quotient, remainder) = multiple.divide(&polynomial); prop_assert!(remainder.is_zero()); @@ -3265,7 +3730,7 @@ mod test_polynomials { #[strategy(vec(arb(), 1..30))] coefficients: Vec, ) { - let polynomial = Polynomial::::new(coefficients); + let polynomial = Polynomial::new(coefficients); let n = polynomial.degree() as usize; let structured_multiple = polynomial.structured_multiple(); assert!(structured_multiple.degree() as usize <= 2 * n); @@ -3280,17 +3745,17 @@ mod test_polynomials { let remainder = structured_multiple.reduce_long_division(&x2n); assert_eq!(remainder.degree() as usize, n - 1); assert_eq!( + 0, (structured_multiple.clone() - remainder.clone()) .reverse() .degree() as usize, - 0 ); assert_eq!( + BFieldElement::new(1), *(structured_multiple.clone() - remainder) .coefficients .last() .unwrap(), - BFieldElement::new(1) ); } @@ -3314,7 +3779,7 @@ mod test_polynomials { .concat(), ); let (_quotient, remainder) = structured_multiple.divide(&x2n); - assert_eq!(remainder.degree() as usize, n - 1); + assert_eq!(n - 1, remainder.degree() as usize); assert_eq!( (structured_multiple.clone() - remainder.clone()) .reverse() @@ -3337,7 +3802,7 @@ mod test_polynomials { #[strategy(vec(arb(), 1..usize::min(30, #n)))] coefficients: Vec, ) { - let polynomial = Polynomial::::new(coefficients); + let polynomial = Polynomial::new(coefficients); let multiple = polynomial.structured_multiple_of_degree(n); let remainder = multiple.reduce_long_division(&polynomial); prop_assert!(remainder.is_zero()); @@ -3349,7 +3814,7 @@ mod test_polynomials { #[strategy(vec(arb(), 3..usize::min(30, #n)))] mut coefficients: Vec, ) { *coefficients.last_mut().unwrap() = BFieldElement::new(1); - let polynomial = Polynomial::::new(coefficients); + let polynomial = Polynomial::new(coefficients); let structured_multiple = polynomial.structured_multiple_of_degree(n); let xn = Polynomial::new( @@ -3382,7 +3847,7 @@ mod test_polynomials { #[strategy(vec(arb(), 1..usize::min(30, #n)))] coefficients: Vec, ) { - let polynomial = Polynomial::::new(coefficients); + let polynomial = Polynomial::new(coefficients); let multiple = polynomial.structured_multiple_of_degree(n); prop_assert_eq!( multiple.degree() as usize, @@ -3397,8 +3862,7 @@ mod test_polynomials { fn reverse_polynomial_with_nonzero_constant_term_twice_gives_original_back( f: Polynomial, ) { - let fx_plus_1 = f.shift_coefficients(1) - + Polynomial::::from_constant(BFieldElement::new(1)); + let fx_plus_1 = f.shift_coefficients(1) + Polynomial::from_constant(bfe!(1)); prop_assert_eq!(fx_plus_1.clone(), fx_plus_1.reverse().reverse()); } @@ -3539,7 +4003,8 @@ mod test_polynomials { ) { let polynomial = Polynomial::new(coefficients); let evaluations_iterative = polynomial.iterative_batch_evaluate(&domain); - let evaluations_fast = polynomial.divide_and_conquer_batch_evaluate(&domain); + let zerofier_tree = ZerofierTree::new_from_domain(&domain); + let evaluations_fast = polynomial.divide_and_conquer_batch_evaluate(&zerofier_tree); let evaluations_reduce_then = polynomial.reduce_then_batch_evaluate(&domain); prop_assert_eq!(evaluations_iterative.clone(), evaluations_fast); @@ -3553,4 +4018,175 @@ mod test_polynomials { ) { prop_assert_eq!(a.divide(&b).1, a.reduce(&b)); } + + #[proptest] + fn structured_multiple_of_monomial_term_is_actually_multiple_and_of_right_degree( + #[strategy(1usize..1000)] degree: usize, + #[filter(!#leading_coefficient.is_zero())] leading_coefficient: BFieldElement, + #[strategy(#degree+1..#degree+200)] target_degree: usize, + ) { + let coefficients = [bfe_vec![0; degree], vec![leading_coefficient]].concat(); + let polynomial = Polynomial::new(coefficients); + let multiple = polynomial.structured_multiple_of_degree(target_degree); + prop_assert_eq!(Polynomial::zero(), multiple.reduce(&polynomial)); + prop_assert_eq!(multiple.degree() as usize, target_degree); + } + + #[proptest] + fn monomial_term_divided_by_smaller_monomial_term_gives_clean_division( + #[strategy(100usize..102)] high_degree: usize, + #[filter(!#high_lc.is_zero())] high_lc: BFieldElement, + #[strategy(83..#high_degree)] low_degree: usize, + #[filter(!#low_lc.is_zero())] low_lc: BFieldElement, + ) { + let numerator = Polynomial::new([bfe_vec![0; high_degree], vec![high_lc]].concat()); + let denominator = Polynomial::new([bfe_vec![0; low_degree], vec![low_lc]].concat()); + let (quotient, remainder) = numerator.divide(&denominator); + prop_assert_eq!( + quotient + .coefficients + .iter() + .filter(|c| !c.is_zero()) + .count(), + 1 + ); + prop_assert_eq!(Polynomial::zero(), remainder); + } + + #[proptest] + fn fast_modular_coset_interpolate_agrees_with_interpolate_then_reduce_property( + #[filter(!#modulus.is_zero())] modulus: Polynomial, + #[strategy(0usize..10)] _logn: usize, + #[strategy(Just(1 << #_logn))] n: usize, + #[strategy(vec(arb(), #n))] values: Vec, + #[strategy(arb())] offset: BFieldElement, + ) { + let omega = BFieldElement::primitive_root_of_unity(n as u64).unwrap(); + let domain = (0..n) + .scan(offset, |acc: &mut BFieldElement, _| { + let yld = *acc; + *acc *= omega; + Some(yld) + }) + .collect_vec(); + prop_assert_eq!( + Polynomial::fast_modular_coset_interpolate(&values, offset, &modulus), + Polynomial::interpolate(&domain, &values).reduce(&modulus) + ) + } + + #[test] + fn fast_modular_coset_interpolate_agrees_with_interpolate_then_reduce_concrete() { + let logn = 8; + let n = 1u64 << logn; + let modulus = Polynomial::new(bfe_vec![2, 3, 1]); + let values = (0..n).map(|i| BFieldElement::new(i / 5)).collect_vec(); + let offset = BFieldElement::new(7); + + let omega = BFieldElement::primitive_root_of_unity(n).unwrap(); + let mut domain = bfe_vec![0; n as usize]; + domain[0] = offset; + for i in 1..n as usize { + domain[i] = domain[i - 1] * omega; + } + assert_eq!( + Polynomial::interpolate(&domain, &values).reduce(&modulus), + Polynomial::fast_modular_coset_interpolate(&values, offset, &modulus), + ) + } + + #[proptest] + fn coset_extrapolation_methods_agree_with_interpolate_then_evaluate( + #[strategy(0usize..10)] _logn: usize, + #[strategy(Just(1 << #_logn))] n: usize, + #[strategy(vec(arb(), #n))] values: Vec, + #[strategy(arb())] offset: BFieldElement, + #[strategy(vec(arb(), 1..1000))] points: Vec, + ) { + let omega = BFieldElement::primitive_root_of_unity(n as u64).unwrap(); + let domain = (0..n) + .scan(offset, |acc: &mut BFieldElement, _| { + let yld = *acc; + *acc *= omega; + Some(yld) + }) + .collect_vec(); + let fast_coset_extrapolation = Polynomial::fast_coset_extrapolate(offset, &values, &points); + let naive_coset_extrapolation = + Polynomial::naive_coset_extrapolate(offset, &values, &points); + let interpolation_then_evaluation = + Polynomial::interpolate(&domain, &values).batch_evaluate(&points); + prop_assert_eq!(fast_coset_extrapolation.clone(), naive_coset_extrapolation); + prop_assert_eq!(fast_coset_extrapolation, interpolation_then_evaluation); + } + + #[proptest] + fn coset_extrapolate_and_batch_coset_extrapolate_agree( + #[strategy(1usize..10)] _logn: usize, + #[strategy(Just(1<<#_logn))] n: usize, + #[strategy(0usize..5)] _m: usize, + #[strategy(vec(arb(), #_m*#n))] codewords: Vec, + #[strategy(vec(arb(), 0..20))] points: Vec, + ) { + let offset = BFieldElement::new(7); + + let one_by_one_dispatch = codewords + .chunks(n) + .flat_map(|chunk| Polynomial::coset_extrapolate(offset, chunk, &points)) + .collect_vec(); + let batched_dispatch = Polynomial::batch_coset_extrapolate(offset, n, &codewords, &points); + let par_batched_dispatch = + Polynomial::par_batch_coset_extrapolate(offset, n, &codewords, &points); + prop_assert_eq!(one_by_one_dispatch.clone(), batched_dispatch); + prop_assert_eq!(one_by_one_dispatch, par_batched_dispatch); + + let one_by_one_fast = codewords + .chunks(n) + .flat_map(|chunk| Polynomial::fast_coset_extrapolate(offset, chunk, &points)) + .collect_vec(); + let batched_fast = Polynomial::batch_fast_coset_extrapolate(offset, n, &codewords, &points); + let par_batched_fast = + Polynomial::par_batch_fast_coset_extrapolate(offset, n, &codewords, &points); + prop_assert_eq!(one_by_one_fast.clone(), batched_fast); + prop_assert_eq!(one_by_one_fast, par_batched_fast); + + let one_by_one_naive = codewords + .chunks(n) + .flat_map(|chunk| Polynomial::naive_coset_extrapolate(offset, chunk, &points)) + .collect_vec(); + let batched_naive = + Polynomial::batch_naive_coset_extrapolate(offset, n, &codewords, &points); + let par_batched_naive = + Polynomial::par_batch_naive_coset_extrapolate(offset, n, &codewords, &points); + prop_assert_eq!(one_by_one_naive.clone(), batched_naive); + prop_assert_eq!(one_by_one_naive, par_batched_naive); + } + + #[test] + fn fast_modular_coset_interpolate_thresholds_relate_properly() { + let intt = Polynomial::::FAST_MODULAR_COSET_INTERPOLATE_CUTOFF_THRESHOLD_PREFER_INTT; + let lagrange = Polynomial::::FAST_MODULAR_COSET_INTERPOLATE_CUTOFF_THRESHOLD_PREFER_LAGRANGE; + assert!(intt > lagrange); + } + + #[proptest] + fn interpolate_and_par_interpolate_agree( + #[filter(#points.len() > 0)] points: Vec, + #[strategy(vec(arb(), #points.len()))] domain: Vec, + ) { + let expected_interpolant = Polynomial::interpolate(&domain, &points); + let observed_interpolant = Polynomial::par_interpolate(&domain, &points); + prop_assert_eq!(expected_interpolant, observed_interpolant); + } + + #[proptest] + fn batch_evaluate_agrees_with_par_batch_evalaute( + polynomial: Polynomial, + points: Vec, + ) { + prop_assert_eq!( + polynomial.batch_evaluate(&points), + polynomial.par_batch_evaluate(&points) + ); + } } diff --git a/twenty-first/src/math/zerofier_tree.rs b/twenty-first/src/math/zerofier_tree.rs new file mode 100644 index 000000000..7b7b7ffce --- /dev/null +++ b/twenty-first/src/math/zerofier_tree.rs @@ -0,0 +1,160 @@ +use std::collections::VecDeque; +use std::ops::MulAssign; + +use num_traits::One; + +use super::b_field_element::BFieldElement; +use super::polynomial::Polynomial; +use super::traits::FiniteField; + +#[derive(Debug, Clone, PartialEq)] +pub struct Leaf> { + pub(crate) points: Vec, + zerofier: Polynomial, +} + +impl Leaf +where + FF: FiniteField + MulAssign, +{ + pub fn new(points: Vec) -> Self { + let zerofier = Polynomial::zerofier(&points); + Self { points, zerofier } + } +} + +#[derive(Debug, Clone, PartialEq)] +pub struct Branch> { + zerofier: Polynomial, + pub(crate) left: ZerofierTree, + pub(crate) right: ZerofierTree, +} + +impl Branch +where + FF: FiniteField + MulAssign, +{ + pub fn new(left: ZerofierTree, right: ZerofierTree) -> Self { + let zerofier = left.zerofier().multiply(&right.zerofier()); + Self { + zerofier, + left, + right, + } + } +} + +/// A zerofier tree is a balanced binary tree of vanishing polynomials. +/// Conceptually, every leaf corresponds to a single point, and the value of +/// that leaf is the monic linear polynomial that evaluates to zero there and +/// no-where else. Every non-leaf node is the product of its two children. +/// In practice, it makes sense to truncate the tree depth, in which case every +/// leaf contains a chunk of points whose size is upper-bounded and more or less +/// equal to some constant threshold. +#[derive(Debug, Clone, PartialEq)] +pub enum ZerofierTree> { + Leaf(Leaf), + Branch(Box>), + Padding, +} + +impl> ZerofierTree { + /// Regulates the depth at which the tree is truncated. Phrased differently, + /// regulates the number of points contained by each leaf. + const RECURSION_CUTOFF_THRESHOLD: usize = 16; + + pub fn new_from_domain(domain: &[FF]) -> Self { + let mut nodes = domain + .chunks(Self::RECURSION_CUTOFF_THRESHOLD) + .map(|chunk| { + let leaf = Leaf::new(chunk.to_vec()); + ZerofierTree::Leaf(leaf) + }) + .collect::>(); + nodes.resize(nodes.len().next_power_of_two(), ZerofierTree::Padding); + while nodes.len() > 1 { + let right = nodes.pop_back().unwrap(); + let left = nodes.pop_back().unwrap(); + if left == ZerofierTree::Padding { + nodes.push_front(ZerofierTree::Padding); + } else { + let new_node = Branch::new(left, right); + nodes.push_front(ZerofierTree::Branch(Box::new(new_node))); + } + } + nodes.pop_front().unwrap() + } + + pub fn zerofier(&self) -> Polynomial { + match self { + ZerofierTree::Leaf(leaf) => leaf.zerofier.clone(), + ZerofierTree::Branch(branch) => branch.zerofier.clone(), + ZerofierTree::Padding => Polynomial::::one(), + } + } +} + +#[cfg(test)] +mod test { + use num_traits::Zero; + use proptest::collection::vec; + use proptest::prop_assert_eq; + use proptest_arbitrary_interop::arb; + use test_strategy::proptest; + + use crate::math::zerofier_tree::ZerofierTree; + use crate::prelude::BFieldElement; + use crate::prelude::Polynomial; + + #[test] + fn zerofier_tree_can_be_empty() { + ZerofierTree::::new_from_domain(&[]); + } + #[proptest] + fn zerofier_tree_root_is_multiple_of_children( + #[strategy(vec(arb(), 2*ZerofierTree::::RECURSION_CUTOFF_THRESHOLD))] + points: Vec, + ) { + let zerofier_tree = ZerofierTree::new_from_domain(&points); + let ZerofierTree::Branch(ref branch) = &zerofier_tree else { + panic!("not enough leafs"); + }; + prop_assert_eq!( + Polynomial::zero(), + zerofier_tree.zerofier().reduce(&branch.left.zerofier()) + ); + prop_assert_eq!( + Polynomial::zero(), + zerofier_tree.zerofier().reduce(&branch.right.zerofier()) + ); + } + + #[proptest] + fn zerofier_tree_root_has_right_degree( + #[strategy(vec(arb(), 1..(1<<10)))] points: Vec, + ) { + let zerofier_tree = ZerofierTree::new_from_domain(&points); + prop_assert_eq!(points.len(), zerofier_tree.zerofier().degree() as usize); + } + + #[proptest] + fn zerofier_tree_root_zerofies( + #[strategy(vec(arb(), 1..(1<<10)))] points: Vec, + #[strategy(0usize..#points.len())] index: usize, + ) { + let zerofier_tree = ZerofierTree::new_from_domain(&points); + prop_assert_eq!( + BFieldElement::zero(), + zerofier_tree.zerofier().evaluate(points[index]) + ); + } + + #[proptest] + fn zerofier_tree_and_polynomial_agree_on_zerofiers( + #[strategy(vec(arb(), 1..(1<<10)))] points: Vec, + ) { + let zerofier_tree = ZerofierTree::new_from_domain(&points); + let polynomial_zerofier = Polynomial::zerofier(&points); + prop_assert_eq!(polynomial_zerofier, zerofier_tree.zerofier()); + } +}