diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 1e0f1e347..cf114d3f7 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -40,6 +40,10 @@ jobs:
         run: cargo clippy --all-targets -- -D warnings
 
       - name: Run tests
+        # Temporary workaround for Windows path issue, see:
+        # https://github.com/nextest-rs/nextest/issues/1493#issuecomment-2106331574
+        env:
+          RUSTUP_WINDOWS_PATH_ADD_BIN: 1
         run: cargo nextest run --no-fail-fast --all-targets
 
         # doctests are special [^1] but this step does not incur a performance penalty [^2]
diff --git a/cliff.toml b/cliff.toml
index 35dc6c202..5322b0889 100644
--- a/cliff.toml
+++ b/cliff.toml
@@ -58,6 +58,7 @@ commit_parsers = [
   { body = ".*security",          group = "<!-- 70 --> 🔒️ Security" },
   { message = "^revert",          group = "<!-- 80 --> ⏪️ Revert" },
   { message = "^style",           group = "<!-- 90 --> 🎨 Styling" },
+  { message = "^support",         skip = true },
 ]
 
 protect_breaking_commits = false
diff --git a/twenty-first/Cargo.toml b/twenty-first/Cargo.toml
index 529ee9db4..930e95bef 100644
--- a/twenty-first/Cargo.toml
+++ b/twenty-first/Cargo.toml
@@ -67,15 +67,19 @@ name = "evaluation"
 harness = false
 
 [[bench]]
-name = "poly_mod_reduce"
+name = "extrapolation"
 harness = false
 
 [[bench]]
-name = "interpolation"
+name = "coset_extrapolation"
 harness = false
 
 [[bench]]
-name = "poly_div"
+name = "poly_mod_reduce"
+harness = false
+
+[[bench]]
+name = "interpolation"
 harness = false
 
 [[bench]]
diff --git a/twenty-first/benches/coset_extrapolation.rs b/twenty-first/benches/coset_extrapolation.rs
new file mode 100644
index 000000000..74f6fdd14
--- /dev/null
+++ b/twenty-first/benches/coset_extrapolation.rs
@@ -0,0 +1,37 @@
+use criterion::criterion_group;
+use criterion::criterion_main;
+use criterion::BenchmarkId;
+use criterion::Criterion;
+
+use twenty_first::math::other::random_elements;
+use twenty_first::prelude::*;
+
+criterion_main!(benches);
+criterion_group!(
+    name = benches;
+    config = Criterion::default().sample_size(10);
+    targets = coset_extrapolation<{ 1 << 18 }, { 1 << 8 }>,
+              coset_extrapolation<{ 1 << 19 }, { 1 << 8 }>,
+              coset_extrapolation<{ 1 << 20 }, { 1 << 8 }>,
+              coset_extrapolation<{ 1 << 21 }, { 1 << 8 }>,
+              coset_extrapolation<{ 1 << 22 }, { 1 << 8 }>,
+              coset_extrapolation<{ 1 << 23 }, { 1 << 8 }>,
+);
+
+fn coset_extrapolation<const SIZE: usize, const NUM_POINTS: usize>(c: &mut Criterion) {
+    let log2_of_size = SIZE.ilog2();
+    let mut group = c.benchmark_group(format!(
+        "Fast extrapolation of length-{SIZE} codeword in {NUM_POINTS} Points"
+    ));
+
+    let codeword = random_elements(SIZE);
+    let offset = BFieldElement::new(7);
+    let eval_points: Vec<BFieldElement> = random_elements(NUM_POINTS);
+
+    let id = BenchmarkId::new("Fast Codeword Extrapolation", log2_of_size);
+    group.bench_function(id, |b| {
+        b.iter(|| Polynomial::<BFieldElement>::coset_extrapolate(offset, &codeword, &eval_points))
+    });
+
+    group.finish();
+}
diff --git a/twenty-first/benches/evaluation.rs b/twenty-first/benches/evaluation.rs
index ae06c426c..6a6a107ce 100644
--- a/twenty-first/benches/evaluation.rs
+++ b/twenty-first/benches/evaluation.rs
@@ -4,6 +4,7 @@ use criterion::BenchmarkId;
 use criterion::Criterion;
 
 use twenty_first::math::other::random_elements;
+use twenty_first::math::zerofier_tree::ZerofierTree;
 use twenty_first::prelude::*;
 
 criterion_main!(benches);
@@ -36,15 +37,18 @@ fn evaluation<const SIZE: usize, const NUM_POINTS: usize>(c: &mut Criterion) {
         b.iter(|| poly.iterative_batch_evaluate(&eval_points))
     });
 
-    // `vector_batch_evaluate` exists, but is super slow. Put it here if you plan to run benchmarks
-    // during a coffee break.
+    let id = BenchmarkId::new("Zerofier Tree", log2_of_size);
+    group.bench_function(id, |b| {
+        b.iter(|| ZerofierTree::new_from_domain(&eval_points))
+    });
 
     let id = BenchmarkId::new("Divide-and-Conquer", log2_of_size);
+    let zerofier_tree = ZerofierTree::new_from_domain(&eval_points);
     group.bench_function(id, |b| {
-        b.iter(|| poly.divide_and_conquer_batch_evaluate(&eval_points))
+        b.iter(|| poly.divide_and_conquer_batch_evaluate(&zerofier_tree))
     });
 
-    let id = BenchmarkId::new("Dispatcher", log2_of_size);
+    let id = BenchmarkId::new("Entrypoint", log2_of_size);
     group.bench_function(id, |b| b.iter(|| poly.batch_evaluate(&eval_points)));
 
     group.finish();
diff --git a/twenty-first/benches/extrapolation.rs b/twenty-first/benches/extrapolation.rs
new file mode 100644
index 000000000..0577b2d02
--- /dev/null
+++ b/twenty-first/benches/extrapolation.rs
@@ -0,0 +1,95 @@
+use criterion::criterion_group;
+use criterion::criterion_main;
+use criterion::BenchmarkId;
+use criterion::Criterion;
+
+use twenty_first::math::ntt::intt;
+use twenty_first::math::other::random_elements;
+use twenty_first::math::traits::PrimitiveRootOfUnity;
+use twenty_first::math::zerofier_tree::ZerofierTree;
+use twenty_first::prelude::*;
+
+criterion_main!(benches);
+criterion_group!(
+    name = benches;
+    config = Criterion::default().sample_size(10);
+    targets = extrapolation<{ 1 << 18 }, { 1 << 6 }>,
+              extrapolation<{ 1 << 18 }, { 1 << 7 }>,
+              extrapolation<{ 1 << 18 }, { 1 << 8 }>,
+              extrapolation<{ 1 << 19 }, { 1 << 6 }>,
+              extrapolation<{ 1 << 19 }, { 1 << 7 }>,
+              extrapolation<{ 1 << 19 }, { 1 << 8 }>,
+              extrapolation<{ 1 << 20 }, { 1 << 6 }>,
+              extrapolation<{ 1 << 20 }, { 1 << 7 }>,
+              extrapolation<{ 1 << 20 }, { 1 << 8 }>,
+);
+
+fn intt_then_evaluate(
+    codeword: &[BFieldElement],
+    offset: BFieldElement,
+    zerofier_tree: &ZerofierTree<BFieldElement>,
+    shift_coefficients: &[BFieldElement],
+    tail_length: usize,
+) -> Vec<BFieldElement> {
+    let omega = BFieldElement::primitive_root_of_unity(codeword.len() as u64).unwrap();
+    let log_domain_length = codeword.len().ilog2();
+    let mut coefficients = codeword.to_vec();
+    intt(&mut coefficients, omega, log_domain_length);
+    let polynomial: Polynomial<BFieldElement> = Polynomial::new(coefficients)
+        .scale(offset.inverse())
+        .reduce_by_ntt_friendly_modulus(shift_coefficients, tail_length);
+    polynomial.divide_and_conquer_batch_evaluate(zerofier_tree)
+}
+
+fn extrapolation<const SIZE: usize, const NUM_POINTS: usize>(c: &mut Criterion) {
+    let log2_of_size = SIZE.ilog2();
+    let mut group = c.benchmark_group(format!(
+        "Extrapolation of length-{SIZE} codeword in {NUM_POINTS} Points"
+    ));
+
+    let codeword = random_elements(SIZE);
+    let offset = BFieldElement::new(7);
+    let eval_points: Vec<BFieldElement> = random_elements(NUM_POINTS);
+
+    let zerofier_tree = ZerofierTree::new_from_domain(&eval_points);
+    let modulus = zerofier_tree.zerofier();
+    let preprocessing_data =
+        Polynomial::fast_modular_coset_interpolate_preprocess(SIZE, offset, &modulus);
+
+    let id = BenchmarkId::new("INTT-then-Evaluate", log2_of_size);
+    group.bench_function(id, |b| {
+        b.iter(|| {
+            intt_then_evaluate(
+                &codeword,
+                offset,
+                &zerofier_tree,
+                &preprocessing_data.shift_coefficients,
+                preprocessing_data.tail_length,
+            )
+        })
+    });
+
+    // We used to have another benchmark here that used barycentric evaluation
+    // (from `fri.rs` in repo triton-vm) inside of a loop over all points. It
+    // was never close to faster.
+    let id = BenchmarkId::new("Fast Codeword Extrapolation", log2_of_size);
+    group.bench_function(id, |b| {
+        b.iter(|| {
+            let minimal_interpolant =
+        Polynomial::<BFieldElement>::fast_modular_coset_interpolate_with_zerofiers_and_ntt_friendly_multiple(
+            &codeword,
+                offset,
+                &modulus,
+                &preprocessing_data
+            );
+            minimal_interpolant.divide_and_conquer_batch_evaluate(&zerofier_tree)
+        })
+    });
+
+    let id = BenchmarkId::new("Dispatcher (includes preprocessing)", log2_of_size);
+    group.bench_function(id, |b| {
+        b.iter(|| Polynomial::coset_extrapolate(offset, &codeword, &eval_points))
+    });
+
+    group.finish();
+}
diff --git a/twenty-first/benches/poly_div.rs b/twenty-first/benches/poly_div.rs
deleted file mode 100644
index 8cf4193cc..000000000
--- a/twenty-first/benches/poly_div.rs
+++ /dev/null
@@ -1,44 +0,0 @@
-use criterion::criterion_group;
-use criterion::criterion_main;
-use criterion::BenchmarkId;
-use criterion::Criterion;
-
-use twenty_first::math::other::random_elements;
-use twenty_first::prelude::*;
-
-criterion_main!(benches);
-criterion_group!(
-    name = benches;
-    config = Criterion::default();
-    targets = poly_div<12,  6>,
-              poly_div<12,  8>,
-              poly_div<12, 12>,
-              poly_div<13,  6>,
-              poly_div<13,  8>,
-              poly_div<13, 12>,
-              poly_div<14,  6>,
-              poly_div<14,  8>,
-              poly_div<14, 12>,
-);
-
-fn poly_div<const LOG2_NUM_DEG: usize, const LOG2_DEN_DEG: usize>(c: &mut Criterion) {
-    let mut group = c.benchmark_group(format!(
-        "Division of Polynomials – \
-         Dividend Degree: 2^{LOG2_NUM_DEG}, Divisor Degree: 2^{LOG2_DEN_DEG}"
-    ));
-    let bench_param = || format!("{LOG2_NUM_DEG}|{LOG2_DEN_DEG}");
-
-    let num = Polynomial::<BFieldElement>::new(random_elements((1 << LOG2_NUM_DEG) + 1));
-    let den = Polynomial::<BFieldElement>::new(random_elements((1 << LOG2_DEN_DEG) + 1));
-
-    let id = BenchmarkId::new("Naïve", bench_param());
-    group.bench_function(id, |b| b.iter(|| num.naive_divide(&den)));
-
-    let id = BenchmarkId::new("Fast", bench_param());
-    group.bench_function(id, |b| b.iter(|| num.fast_divide(&den)));
-
-    let id = BenchmarkId::new("Faster of the two", bench_param());
-    group.bench_function(id, |b| b.iter(|| num.divide(&den)));
-
-    group.finish();
-}
diff --git a/twenty-first/benches/poly_mod_reduce.rs b/twenty-first/benches/poly_mod_reduce.rs
index d19498cb5..b7dc60e0c 100644
--- a/twenty-first/benches/poly_mod_reduce.rs
+++ b/twenty-first/benches/poly_mod_reduce.rs
@@ -30,12 +30,6 @@ fn poly_mod_reduce<const SIZE_LHS: usize, const SIZE_RHS: usize>(c: &mut Criteri
     let id = BenchmarkId::new("long division", log2_of_size);
     group.bench_function(id, |b| b.iter(|| lhs.clone() % rhs.clone()));
 
-    // despite its name, `.fast_divide()` is slow – ignore for big inputs
-    if SIZE_LHS < 1 << 13 && SIZE_RHS < 1 << 13 {
-        let id = BenchmarkId::new("fast division", log2_of_size);
-        group.bench_function(id, |b| b.iter(|| lhs.fast_divide(&rhs)));
-    }
-
     let id = BenchmarkId::new("fast reduce", log2_of_size);
     group.bench_function(id, |b| b.iter(|| lhs.fast_reduce(&rhs)));
 
diff --git a/twenty-first/benches/zerofier.rs b/twenty-first/benches/zerofier.rs
index 1d0243b7d..c7ce85a2d 100644
--- a/twenty-first/benches/zerofier.rs
+++ b/twenty-first/benches/zerofier.rs
@@ -35,7 +35,7 @@ fn zerofier<const SIZE: usize>(c: &mut Criterion) {
     let id = BenchmarkId::new("Fast", SIZE);
     group.bench_function(id, |b| b.iter(|| Polynomial::fast_zerofier(&roots)));
 
-    let id = BenchmarkId::new("Fastest of the three", SIZE);
+    let id = BenchmarkId::new("Dispatcher", SIZE);
     group.bench_function(id, |b| b.iter(|| Polynomial::zerofier(&roots)));
 
     group.finish();
diff --git a/twenty-first/src/lib.rs b/twenty-first/src/lib.rs
index 1d175ade0..1318124fa 100644
--- a/twenty-first/src/lib.rs
+++ b/twenty-first/src/lib.rs
@@ -67,6 +67,12 @@ pub(crate) mod tests {
         implements_usual_auto_traits::<mock::mmr::MockMmr<Tip5>>();
         implements_usual_auto_traits::<util_types::algebraic_hasher::Domain>();
         implements_usual_auto_traits::<util_types::mmr::mmr_accumulator::MmrAccumulator<Tip5>>();
+        implements_usual_auto_traits::<math::zerofier_tree::Branch<BFieldElement>>();
+        implements_usual_auto_traits::<math::zerofier_tree::Leaf<BFieldElement>>();
+        implements_usual_auto_traits::<math::zerofier_tree::ZerofierTree<BFieldElement>>();
+        implements_usual_auto_traits::<
+            math::polynomial::ModularInterpolationPreprocessingData<BFieldElement>,
+        >();
     }
 
     #[test]
diff --git a/twenty-first/src/math.rs b/twenty-first/src/math.rs
index 54323836e..b60f5403a 100644
--- a/twenty-first/src/math.rs
+++ b/twenty-first/src/math.rs
@@ -9,3 +9,4 @@ pub mod polynomial;
 pub mod tip5;
 pub mod traits;
 pub mod x_field_element;
+pub mod zerofier_tree;
diff --git a/twenty-first/src/math/b_field_element.rs b/twenty-first/src/math/b_field_element.rs
index 9b4e5fc2c..5f628eee6 100644
--- a/twenty-first/src/math/b_field_element.rs
+++ b/twenty-first/src/math/b_field_element.rs
@@ -215,6 +215,9 @@ impl BFieldElement {
     /// 2^128 mod P; this is used for conversion of elements into Montgomery representation.
     const R2: u64 = 0xFFFFFFFE00000001;
 
+    /// -2^-1
+    pub const MINUS_TWO_INVERSE: Self = Self::new(9223372034707292160);
+
     #[inline]
     pub const fn new(value: u64) -> Self {
         Self(Self::montyred((value as u128) * (Self::R2 as u128)))
@@ -1300,4 +1303,9 @@ mod b_prime_field_element_test {
     fn bfe_macro_produces_same_result_as_calling_new(value: u64) {
         prop_assert_eq!(BFieldElement::new(value), bfe!(value));
     }
+
+    #[test]
+    fn const_minus_two_inverse_is_really_minus_two_inverse() {
+        assert_eq!(bfe!(-2).inverse(), BFieldElement::MINUS_TWO_INVERSE);
+    }
 }
diff --git a/twenty-first/src/math/polynomial.rs b/twenty-first/src/math/polynomial.rs
index ae0fe3a92..cabd4b586 100644
--- a/twenty-first/src/math/polynomial.rs
+++ b/twenty-first/src/math/polynomial.rs
@@ -30,6 +30,7 @@ use crate::prelude::Inverse;
 use crate::prelude::XFieldElement;
 
 use super::traits::PrimitiveRootOfUnity;
+use super::zerofier_tree::ZerofierTree;
 
 impl<FF: FiniteField> Zero for Polynomial<FF> {
     fn zero() -> Self {
@@ -55,6 +56,17 @@ impl<FF: FiniteField> One for Polynomial<FF> {
     }
 }
 
+/// Data produced by the preprocessing phase of a batch modular interpolation.
+/// Marked `pub` for benchmarking purposes. Not part of the public API.
+#[doc(hidden)]
+#[derive(Debug, Clone)]
+pub struct ModularInterpolationPreprocessingData<FF: FiniteField> {
+    pub even_zerofiers: Vec<Polynomial<FF>>,
+    pub odd_zerofiers: Vec<Polynomial<FF>>,
+    pub shift_coefficients: Vec<FF>,
+    pub tail_length: usize,
+}
+
 /// A univariate polynomial with coefficients in a [finite field](FiniteField), in monomial form.
 #[derive(Clone, Arbitrary)]
 pub struct Polynomial<FF: FiniteField> {
@@ -134,21 +146,16 @@ where
     /// Extracted from `cargo bench --bench poly_mul` on mjolnir.
     const FAST_MULTIPLY_CUTOFF_THRESHOLD: isize = 1 << 8;
 
-    /// [Divide and conquer batch evaluation](Self::divide_and_conquer_batch_evaluate)
-    /// is slower than [iterative evaluation](Self::iterative_batch_evaluate)
-    /// on domains smaller than this threshold
-    const DIVIDE_AND_CONQUER_EVALUATE_CUTOFF_THRESHOLD: usize = 1 << 5;
-
     /// Computing the [fast zerofier][fast] is slower than computing the [smart zerofier][smart] for
     /// domain sizes smaller than this threshold. The [naïve zerofier][naive] is always slower to
     /// compute than the [smart zerofier][smart] for domain sizes smaller than the threshold.
     ///
-    /// Extracted from `cargo bench --bench zerofier` on mjolnir.
+    /// Extracted from `cargo bench --bench zerofier`.
     ///
     /// [naive]: Self::naive_zerofier
     /// [smart]: Self::smart_zerofier
     /// [fast]: Self::fast_zerofier
-    const FAST_ZEROFIER_CUTOFF_THRESHOLD: usize = 200;
+    const FAST_ZEROFIER_CUTOFF_THRESHOLD: usize = 100;
 
     /// [Fast interpolation](Self::fast_interpolate) is slower than
     /// [Lagrange interpolation](Self::lagrange_interpolate) below this threshold.
@@ -156,6 +163,22 @@ where
     /// Extracted from `cargo bench --bench interpolation` on mjolnir.
     const FAST_INTERPOLATE_CUTOFF_THRESHOLD: usize = 1 << 8;
 
+    /// Regulates the recursion depth at which
+    /// [Fast modular coset interpolation](Self::fast_modular_coset_interpolate)
+    /// is slower and switches to
+    /// [Lagrange interpolation](Self::lagrange_interpolate).
+    const FAST_MODULAR_COSET_INTERPOLATE_CUTOFF_THRESHOLD_PREFER_LAGRANGE: usize = 1 << 8;
+
+    /// Regulates the recursion depth at which
+    /// [Fast modular coset interpolation](Self::fast_modular_coset_interpolate)
+    /// is slower and switches to [INTT](ntt::intt)-then-[reduce](Self::reduce).
+    const FAST_MODULAR_COSET_INTERPOLATE_CUTOFF_THRESHOLD_PREFER_INTT: usize = 1 << 17;
+
+    /// Regulates when to prefer the [Fast coset extrapolation](Self::fast_coset_extrapolate)
+    /// over the [naïve method](Self::naive_coset_extrapolate). Threshold found
+    /// using `cargo criterion --bench extrapolation`.
+    const FAST_COSET_EXTRAPOLATE_THRESHOLD: usize = 100;
+
     /// Inside `formal_power_series_inverse`, when to multiply naively and when
     /// to use NTT-based multiplication. Use benchmark
     /// `formal_power_series_inverse` to find the optimum. Based on benchmarks,
@@ -175,9 +198,6 @@ where
     /// when.
     const REDUCE_BEFORE_EVALUATE_THRESHOLD_RATIO: isize = 4;
 
-    /// Regulates when to use long division and when to use NTT-based divide.
-    const FAST_DIVIDE_THRESHOLD: isize = 1000;
-
     /// Return the polynomial which corresponds to the transformation `x → α·x`.
     ///
     /// Given a polynomial P(x), produce P'(x) := P(α·x). Evaluating P'(x) then corresponds to
@@ -347,6 +367,51 @@ where
         Self::new(hadamard_product)
     }
 
+    /// Multiply a bunch of polynomials together.
+    pub fn batch_multiply(factors: &[Self]) -> Self {
+        // Build a tree-like structure of multiplications to keep the degrees of the
+        // factors roughly equal throughout the process. This makes efficient use of
+        // the `.multiply()` dispatcher.
+        // In contrast, using a simple `.reduce()`, the accumulator polynomial would
+        // have a much higher degree than the individual factors.
+        // todo: benchmark the current approach against the “reduce” approach.
+
+        if factors.is_empty() {
+            return Polynomial::one();
+        }
+        let mut products = factors.to_vec();
+        while products.len() != 1 {
+            products = products
+                .chunks(2)
+                .map(|chunk| match chunk.len() {
+                    2 => chunk[0].multiply(&chunk[1]),
+                    1 => chunk[0].clone(),
+                    _ => unreachable!(),
+                })
+                .collect();
+        }
+        products.pop().unwrap()
+    }
+
+    /// Parallel version of [`batch_multiply`](Self::batch_multiply).
+    pub fn par_batch_multiply(factors: &[Self]) -> Self {
+        if factors.is_empty() {
+            return Polynomial::one();
+        }
+        let num_threads = available_parallelism()
+            .map(|non_zero_usize| non_zero_usize.get())
+            .unwrap_or(1);
+        let mut products = factors.to_vec();
+        while products.len() != 1 {
+            let chunk_size = usize::max(2, products.len() / num_threads);
+            products = products
+                .par_chunks(chunk_size)
+                .map(Self::batch_multiply)
+                .collect();
+        }
+        products.pop().unwrap()
+    }
+
     /// Compute the lowest degree polynomial with the provided roots.
     /// Also known as “vanishing polynomial.”
     ///
@@ -372,6 +437,25 @@ where
         }
     }
 
+    /// Parallel version of [`zerofier`](Self::zerofier).
+    pub fn par_zerofier(roots: &[FF]) -> Self {
+        if roots.is_empty() {
+            return Self::one();
+        }
+        let num_threads = available_parallelism()
+            .map(|non_zero_usize| non_zero_usize.get())
+            .unwrap_or(1);
+        let chunk_size = roots
+            .len()
+            .div_ceil(num_threads)
+            .max(Self::FAST_ZEROFIER_CUTOFF_THRESHOLD);
+        let factors = roots
+            .par_chunks(chunk_size)
+            .map(|chunk| Self::zerofier(chunk))
+            .collect::<Vec<_>>();
+        Self::par_batch_multiply(&factors)
+    }
+
     /// Only `pub` to allow benchmarking; not considered part of the public API.
     #[doc(hidden)]
     pub fn smart_zerofier(roots: &[FF]) -> Self {
@@ -392,10 +476,8 @@ where
     #[doc(hidden)]
     pub fn fast_zerofier(roots: &[FF]) -> Self {
         let mid_point = roots.len() / 2;
-        let (left, right) = rayon::join(
-            || Self::zerofier(&roots[..mid_point]),
-            || Self::zerofier(&roots[mid_point..]),
-        );
+        let left = Self::zerofier(&roots[..mid_point]);
+        let right = Self::zerofier(&roots[mid_point..]);
 
         left.multiply(&right)
     }
@@ -756,25 +838,34 @@ where
 
     /// Evaluate the polynomial on a batch of points.
     pub fn batch_evaluate(&self, domain: &[FF]) -> Vec<FF> {
-        if self.degree() >= Self::REDUCE_BEFORE_EVALUATE_THRESHOLD_RATIO * (domain.len() as isize) {
+        if self.is_zero() {
+            vec![FF::zero(); domain.len()]
+        } else if self.degree()
+            >= Self::REDUCE_BEFORE_EVALUATE_THRESHOLD_RATIO * (domain.len() as isize)
+        {
             self.reduce_then_batch_evaluate(domain)
-        } else if domain.len() >= Self::DIVIDE_AND_CONQUER_EVALUATE_CUTOFF_THRESHOLD {
-            self.divide_and_conquer_batch_evaluate(domain)
         } else {
-            self.iterative_batch_evaluate(domain)
+            let zerofier_tree = ZerofierTree::new_from_domain(domain);
+            self.divide_and_conquer_batch_evaluate(&zerofier_tree)
         }
     }
 
     fn reduce_then_batch_evaluate(&self, domain: &[FF]) -> Vec<FF> {
-        let zerofier = Self::zerofier(domain);
+        let zerofier_tree = ZerofierTree::new_from_domain(domain);
+        let zerofier = zerofier_tree.zerofier();
         let remainder = self.fast_reduce(&zerofier);
-        remainder.batch_evaluate(domain)
+        remainder.divide_and_conquer_batch_evaluate(&zerofier_tree)
     }
 
     /// Parallel version of [`batch_evaluate`](Self::batch_evaluate).
     pub fn par_batch_evaluate(&self, domain: &[FF]) -> Vec<FF> {
-        let num_threads = available_parallelism().map(|nzu| nzu.get()).unwrap_or(1);
-        let chunk_size = domain.len().next_multiple_of(num_threads) / num_threads;
+        if domain.is_empty() || self.is_zero() {
+            return vec![FF::zero(); domain.len()];
+        }
+        let num_threads = available_parallelism()
+            .map(|non_zero_usize| non_zero_usize.get())
+            .unwrap_or(1);
+        let chunk_size = domain.len().div_ceil(num_threads);
         domain
             .par_chunks(chunk_size)
             .flat_map(|ch| self.batch_evaluate(ch))
@@ -787,37 +878,20 @@ where
         domain.iter().map(|&p| self.evaluate(p)).collect()
     }
 
-    /// Only marked `pub` for benchmarking; not considered part of the public API.
-    #[doc(hidden)]
-    pub fn par_evaluate(&self, domain: &[FF]) -> Vec<FF> {
-        domain.par_iter().map(|&p| self.evaluate(p)).collect()
-    }
-
     /// Only `pub` to allow benchmarking; not considered part of the public API.
     #[doc(hidden)]
-    pub fn vector_batch_evaluate(&self, domain: &[FF]) -> Vec<FF> {
-        let mut accumulators = vec![FF::zero(); domain.len()];
-        for &c in self.coefficients.iter().rev() {
-            accumulators
-                .par_iter_mut()
-                .zip(domain)
-                .for_each(|(acc, &x)| *acc = *acc * x + c);
+    pub fn divide_and_conquer_batch_evaluate(&self, zerofier_tree: &ZerofierTree<FF>) -> Vec<FF> {
+        match zerofier_tree {
+            ZerofierTree::Leaf(leaf) => self
+                .reduce(&zerofier_tree.zerofier())
+                .iterative_batch_evaluate(&leaf.points),
+            ZerofierTree::Branch(branch) => [
+                self.divide_and_conquer_batch_evaluate(&branch.left),
+                self.divide_and_conquer_batch_evaluate(&branch.right),
+            ]
+            .concat(),
+            ZerofierTree::Padding => vec![],
         }
-
-        accumulators
-    }
-
-    /// Only `pub` to allow benchmarking; not considered part of the public API.
-    #[doc(hidden)]
-    pub fn divide_and_conquer_batch_evaluate(&self, domain: &[FF]) -> Vec<FF> {
-        let mid_point = domain.len() / 2;
-        let left_half = &domain[..mid_point];
-        let right_half = &domain[mid_point..];
-
-        [left_half, right_half]
-            .into_iter()
-            .flat_map(|half_domain| self.batch_evaluate(half_domain))
-            .collect()
     }
 
     /// Fast evaluate on a coset domain, which is the group generated by `generator^i * offset`.
@@ -890,11 +964,9 @@ where
     ///
     /// Panics if the `divisor` is zero.
     pub fn divide(&self, divisor: &Self) -> (Self, Self) {
-        if self.degree() > Self::FAST_DIVIDE_THRESHOLD {
-            self.fast_divide(divisor)
-        } else {
-            self.naive_divide(divisor)
-        }
+        // There is an NTT-based division algorithm, but for no practical
+        // parameter set is it faster than long division.
+        self.naive_divide(divisor)
     }
 
     /// Compute a polynomial g(X) from a given polynomial f(X) such that
@@ -1095,7 +1167,16 @@ where
         let reverse = self.reverse();
         let inverse_reverse = reverse.formal_power_series_inverse_minimal(n as usize);
         let product_reverse = reverse.multiply(&inverse_reverse);
-        product_reverse.reverse()
+
+        // Correct for lost trailing coefficients, if any
+        let product = product_reverse.reverse();
+        let product_degree = product.degree();
+        let shift = if product_degree < n {
+            n - product_degree
+        } else {
+            0
+        };
+        product.shift_coefficients(shift as usize)
     }
 
     /// Given a polynomial f(X) and an integer n, find a multiple of f(X) of the
@@ -1114,7 +1195,9 @@ where
                 [vec![FF::from(0); n], vec![self.coefficients[0].inverse()]].concat(),
             );
         }
+
         let reverse = self.reverse();
+
         // The next function gives back a polynomial g(X) of degree at most arg,
         // such that f(X) * g(X) = 1 mod X^arg.
         // Without modular reduction, the degree of the product f(X) * g(X) is
@@ -1122,7 +1205,11 @@ where
         // and arg = n - deg(f).
         let inverse_reverse = reverse.formal_power_series_inverse_minimal(n - degree);
         let product_reverse = reverse.multiply(&inverse_reverse);
-        product_reverse.reverse()
+        let product = product_reverse.reverse();
+
+        // Coefficient reversal drops trailing zero. Correct for that.
+        let product_degree = product.degree() as usize;
+        product.shift_coefficients(n - product_degree)
     }
 
     /// Reduces f(X) by a structured modulus, which is of the form
@@ -1130,16 +1217,15 @@ where
     /// form, polynomial modular reductions can be computed faster than in the
     /// generic case.
     fn reduce_by_structured_modulus(&self, multiple: &Self) -> Self {
-        let leading_term = Polynomial::new(
-            [
-                vec![FF::from(0); multiple.degree() as usize],
-                vec![FF::from(1); 1],
-            ]
-            .concat(),
-        );
+        assert_ne!(multiple.degree(), 0);
+        let multiple_degree = usize::try_from(multiple.degree()).expect("cannot reduce by zero");
+        let leading_term =
+            Polynomial::new([vec![FF::from(0); multiple_degree], vec![FF::from(1); 1]].concat());
         let shift_polynomial = multiple.clone() - leading_term.clone();
-        let m = shift_polynomial.degree() as usize + 1;
-        let n = multiple.degree() as usize - m;
+        let m = usize::try_from(shift_polynomial.degree())
+            .map(|unsigned_degree| unsigned_degree + 1)
+            .unwrap_or(0);
+        let n = multiple_degree - m;
         if self.coefficients.len() < n + m {
             return self.clone();
         }
@@ -1177,28 +1263,37 @@ where
     ///
     /// This method uses NTT-based multiplication, meaning that the unstructured
     /// part of the structured multiple must be given in NTT-domain.
-    fn reduce_by_ntt_friendly_modulus(&self, shift_ntt: &[FF], m: usize) -> Self {
+    ///
+    /// This function is marked `pub` for benchmarking. Not considered part of
+    /// the public API
+    #[doc(hidden)]
+    pub fn reduce_by_ntt_friendly_modulus(&self, shift_ntt: &[FF], tail_length: usize) -> Self {
         let domain_length = shift_ntt.len();
         assert!(domain_length.is_power_of_two());
         let log_domain_length = domain_length.ilog2();
         let omega = BFieldElement::primitive_root_of_unity(domain_length as u64).unwrap();
-        let n = domain_length - m;
+        let chunk_size = domain_length - tail_length;
 
-        if self.coefficients.len() < n + m {
+        if self.coefficients.len() < chunk_size + tail_length {
             return self.clone();
         }
-        let num_reducible_chunks = (self.coefficients.len() - (m + n) + n - 1) / n;
+        let num_reducible_chunks =
+            (self.coefficients.len() - (tail_length + chunk_size)).div_ceil(chunk_size);
 
-        let range_start = num_reducible_chunks * n;
+        let range_start = num_reducible_chunks * chunk_size;
         let mut working_window = if range_start >= self.coefficients.len() {
-            vec![FF::from(0); n + m]
+            vec![FF::from(0); chunk_size + tail_length]
         } else {
             self.coefficients[range_start..].to_vec()
         };
-        working_window.resize(n + m, FF::from(0));
+        working_window.resize(chunk_size + tail_length, FF::from(0));
 
         for chunk_index in (0..num_reducible_chunks).rev() {
-            let mut product = [working_window[m..].to_vec(), vec![FF::from(0); m]].concat();
+            let mut product = [
+                working_window[tail_length..].to_vec(),
+                vec![FF::from(0); tail_length],
+            ]
+            .concat();
             ntt(&mut product, omega, log_domain_length);
             product
                 .iter_mut()
@@ -1206,12 +1301,20 @@ where
                 .for_each(|(l, r)| *l *= *r);
             intt(&mut product, omega, log_domain_length);
 
-            working_window = [vec![FF::from(0); n], working_window[0..m].to_vec()].concat();
-            for (i, wwi) in working_window.iter_mut().enumerate().take(n) {
-                *wwi = self.coefficients[chunk_index * n + i];
+            working_window = [
+                vec![FF::from(0); chunk_size],
+                working_window[0..tail_length].to_vec(),
+            ]
+            .concat();
+            for (i, wwi) in working_window.iter_mut().enumerate().take(chunk_size) {
+                *wwi = self.coefficients[chunk_index * chunk_size + i];
             }
 
-            for (i, wwi) in working_window.iter_mut().enumerate().take(n + m) {
+            for (i, wwi) in working_window
+                .iter_mut()
+                .enumerate()
+                .take(chunk_size + tail_length)
+            {
                 *wwi -= product[i];
             }
         }
@@ -1219,6 +1322,34 @@ where
         Polynomial::new(working_window)
     }
 
+    /// Only marked `pub` for benchmarking purposes. Not considered part of the
+    /// public API.
+    #[doc(hidden)]
+    pub fn shift_factor_ntt_with_tail_size(&self) -> (Vec<FF>, usize) {
+        let n = usize::max(
+            Polynomial::<FF>::FAST_REDUCE_CUTOFF_THRESHOLD,
+            self.degree() as usize * 2,
+        )
+        .next_power_of_two();
+        let ntt_friendly_multiple = self.structured_multiple_of_degree(n);
+        // m = 1 + degree(ntt_friendly_multiple - leading term)
+        let m = 1 + ntt_friendly_multiple
+            .coefficients
+            .iter()
+            .enumerate()
+            .rev()
+            .skip(1)
+            .find_map(|(i, c)| if !c.is_zero() { Some(i) } else { None })
+            .unwrap_or(0);
+        let mut shift_factor_ntt = ntt_friendly_multiple.coefficients[..n].to_vec();
+        ntt(
+            &mut shift_factor_ntt,
+            BFieldElement::primitive_root_of_unity(n as u64).unwrap(),
+            n.ilog2(),
+        );
+        (shift_factor_ntt, m)
+    }
+
     /// Compute the remainder after division of one polynomial by another. This
     /// method first reduces the numerator by a multiple of the denominator that
     /// was constructed to enable NTT-based chunk-wise reduction, before
@@ -1242,28 +1373,10 @@ where
         //               |- n=2^k coefficients.
         // This allows us to reduce the numerator's coefficients in chunks of
         // n-m using NTT-based multiplication over a domain of size n = 2^k.
-        let n = usize::max(
-            Polynomial::<FF>::FAST_REDUCE_CUTOFF_THRESHOLD,
-            modulus.degree() as usize * 2,
-        )
-        .next_power_of_two();
-        let ntt_friendly_multiple = modulus.structured_multiple_of_degree(n);
-        // m = 1 + degree(ntt_friendly_multiple - leading term)
-        let m = 1 + ntt_friendly_multiple
-            .coefficients
-            .iter()
-            .enumerate()
-            .rev()
-            .skip(1)
-            .find_map(|(i, c)| if !c.is_zero() { Some(i) } else { None })
-            .unwrap_or(0);
-        let mut shift_factor_ntt = ntt_friendly_multiple.coefficients[..n].to_vec();
-        ntt(
-            &mut shift_factor_ntt,
-            BFieldElement::primitive_root_of_unity(n as u64).unwrap(),
-            n.ilog2(),
-        );
-        let mut intermediate_remainder = self.reduce_by_ntt_friendly_modulus(&shift_factor_ntt, m);
+
+        let (shift_factor_ntt, tail_size) = modulus.shift_factor_ntt_with_tail_size();
+        let mut intermediate_remainder =
+            self.reduce_by_ntt_friendly_modulus(&shift_factor_ntt, tail_size);
 
         // 2. Chunk-wise reduction with schoolbook multiplication.
         // We generate a smaller structured multiple of the denominator that
@@ -1281,50 +1394,6 @@ where
         intermediate_remainder.reduce_long_division(modulus)
     }
 
-    /// Polynomial long division with `self` as the dividend, divided by some `divisor`.
-    /// Only `pub` to allow benchmarking; not considered part of the public API.
-    ///
-    /// As the name implies, the advantage of this method over [`divide`](Self::naive_divide) is
-    /// runtime complexity. Concretely, this method has time complexity in O(n·log(n)), whereas
-    /// [`divide`](Self::naive_divide) has time complexity in O(n^2).
-    #[doc(hidden)]
-    pub fn fast_divide(&self, divisor: &Self) -> (Self, Self) {
-        // The math for this function: [0]. There are very slight deviations, for example around the
-        // requirement that the divisor is monic.
-        //
-        // [0] https://cs.uwaterloo.ca/~r5olivei/courses/2021-winter-cs487/lecture5-post.pdf
-
-        let Ok(quotient_degree) = usize::try_from(self.degree() - divisor.degree()) else {
-            return (Self::zero(), self.clone());
-        };
-
-        if divisor.degree() == 0 {
-            let quotient = self.scalar_mul(divisor.leading_coefficient().unwrap().inverse());
-            let remainder = Polynomial::zero();
-            return (quotient, remainder);
-        }
-
-        // Reverse coefficient vectors to move into formal power series ring over FF, i.e., FF[[x]].
-        // Re-interpret as a polynomial to benefit from the already-implemented multiplication
-        // method, which mechanically work the same in FF[X] and FF[[x]].
-        let reverse = |poly: &Self| Self::new(poly.coefficients.iter().copied().rev().collect());
-
-        // Newton iteration to invert divisor up to required precision. Why is this the required
-        // precision? Good question.
-        let precision = (quotient_degree + 1).next_power_of_two();
-
-        let rev_divisor = reverse(divisor);
-        let rev_divisor_inverse = rev_divisor.formal_power_series_inverse_newton(precision);
-
-        let self_reverse = reverse(self);
-        let rev_quotient = self_reverse.multiply(&rev_divisor_inverse);
-
-        let quotient = reverse(&rev_quotient).truncate(quotient_degree);
-
-        let remainder = self.clone() - quotient.multiply(divisor);
-        (quotient, remainder)
-    }
-
     /// The degree-`k` polynomial with the same `k + 1` leading coefficients as `self`. To be more
     /// precise: The degree of the result will be the minimum of `k` and [`Self::degree()`]. This
     /// implies, among other things, that if `self` [is zero](Self::is_zero()), the result will also
@@ -1360,6 +1429,397 @@ where
         let num_coefficients_to_retain = n.min(self.coefficients.len());
         Self::new(self.coefficients[..num_coefficients_to_retain].into())
     }
+
+    /// Preprocessing data for
+    /// [fast modular coset interpolation](Self::fast_modular_coset_interpolate).
+    /// Marked `pub` for benchmarking. Not considered part of the public API.
+    #[doc(hidden)]
+    pub fn fast_modular_coset_interpolate_preprocess(
+        n: usize,
+        offset: BFieldElement,
+        modulus: &Polynomial<FF>,
+    ) -> ModularInterpolationPreprocessingData<FF> {
+        let omega = BFieldElement::primitive_root_of_unity(n as u64).unwrap();
+        // a list of polynomials whose ith element is X^(2^i) mod m(X)
+        let modular_squares = (0..n.ilog2())
+            .scan(
+                Polynomial::<FF>::new(vec![FF::from(0), FF::from(1)]),
+                |acc, _| {
+                    let yld = acc.clone();
+                    *acc = acc.multiply(acc).reduce(modulus);
+                    Some(yld)
+                },
+            )
+            .collect_vec();
+        let even_zerofiers = (0..n.ilog2())
+            .map(|i| offset.inverse().mod_pow(1u64 << i))
+            .zip(modular_squares.iter())
+            .map(|(lc, sq)| sq.scalar_mul(FF::from(lc.value())) - Polynomial::one())
+            .collect_vec();
+        let odd_zerofiers = (0..n.ilog2())
+            .map(|i| (offset * omega).inverse().mod_pow(1u64 << i))
+            .zip(modular_squares.iter())
+            .map(|(lc, sq)| sq.scalar_mul(FF::from(lc.value())) - Polynomial::one())
+            .collect_vec();
+
+        // precompute NTT-friendly multiple of the modulus
+        let (shift_coefficients, tail_length) = modulus.shift_factor_ntt_with_tail_size();
+
+        ModularInterpolationPreprocessingData {
+            even_zerofiers,
+            odd_zerofiers,
+            shift_coefficients,
+            tail_length,
+        }
+    }
+
+    /// Compute f(X) mod m(X) where m(X) is a given modulus and f(X) is the
+    /// interpolant of a list of n values on a domain which is a coset of the
+    /// size-n subgroup that is identified by some offset.
+    fn fast_modular_coset_interpolate(
+        values: &[FF],
+        offset: BFieldElement,
+        modulus: &Polynomial<FF>,
+    ) -> Self {
+        let preprocessing_data =
+            Self::fast_modular_coset_interpolate_preprocess(values.len(), offset, modulus);
+        Self::fast_modular_coset_interpolate_with_zerofiers_and_ntt_friendly_multiple(
+            values,
+            offset,
+            modulus,
+            &preprocessing_data,
+        )
+    }
+
+    /// Only marked `pub` for benchmarking purposes. Not considered part of the
+    /// interface.
+    #[doc(hidden)]
+    pub fn fast_modular_coset_interpolate_with_zerofiers_and_ntt_friendly_multiple(
+        values: &[FF],
+        offset: BFieldElement,
+        modulus: &Polynomial<FF>,
+        preprocessed: &ModularInterpolationPreprocessingData<FF>,
+    ) -> Self {
+        if modulus.is_zero() {
+            panic!("cannot reduce modulo zero")
+        };
+        let n = values.len();
+        let omega = BFieldElement::primitive_root_of_unity(n as u64).unwrap();
+
+        if n < Self::FAST_MODULAR_COSET_INTERPOLATE_CUTOFF_THRESHOLD_PREFER_LAGRANGE {
+            let domain = (0..n)
+                .scan(FF::from(offset.value()), |acc: &mut FF, _| {
+                    let yld = *acc;
+                    *acc *= omega;
+                    Some(yld)
+                })
+                .collect::<Vec<FF>>();
+            let interpolant = Self::lagrange_interpolate(&domain, values);
+            return interpolant.reduce(modulus);
+        } else if n <= Self::FAST_MODULAR_COSET_INTERPOLATE_CUTOFF_THRESHOLD_PREFER_INTT {
+            let mut coefficients = values.to_vec();
+            intt(&mut coefficients, omega, n.ilog2());
+            let interpolant = Polynomial::new(coefficients);
+
+            return interpolant
+                .scale(FF::from(offset.inverse().value()))
+                .reduce_by_ntt_friendly_modulus(
+                    &preprocessed.shift_coefficients,
+                    preprocessed.tail_length,
+                )
+                .reduce(modulus);
+        }
+
+        // Use even-odd domain split.
+        // Even: {offset * omega^{2*i} | i in 0..n/2}
+        // Odd: {offset * omega^{2*i+1} | i in 0..n/2}
+        //      = {(offset * omega) * omega^{2*i} | i in 0..n/2}
+        // But we don't actually need to represent the domains explicitly.
+
+        // 1. Get zerofiers.
+        // The zerofiers are sparse because the domain is structured.
+        // Even: (offset^-1 * X)^(n/2) - 1 = offset^{-n/2} * X^{n/2} - 1
+        // Odd: ((offset * omega)^-1 * X)^(n/2) - 1
+        //      = offset^{-n/2} * omega^{n/2} * X^{n/2} - 1
+        // Note that we are getting the (modularly reduced) zerofiers as
+        // function arguments.
+
+        // 2. Evaluate zerofiers on opposite domains.
+        // Actually, the values are compressible because the zerofiers are
+        // sparse and the domains are structured (compatibly).
+        // Even zerofier on odd domain:
+        // (offset^-1 * X)^(n/2) - 1 on
+        // {(offset * omega) * omega^{2*i} | i in 0..n/2}
+        // = {omega^{n/2}-1 | i in 0..n/2} = {-2, -2, -2, ...}
+        // Odd zerofier on even domain: {omega^-i | i in 0..n/2}
+        // ((offset * omega)^-1 * X)^(n/2) - 1
+        // on {offset * omega^{2*i} | i in 0..n/2}
+        // = {omega^{-n/2} - 1 | i in 0..n/2} = {-2, -2, -2, ...}
+        // Since these values are always the same, there's no point generating
+        // them at runtime. Moreover, we need their batch-inverses in the next
+        // step.
+
+        // 3. Batch-invert zerofiers on opposite domains.
+        // The batch-inversion is actually not performed because we already know
+        // the result: {(-2)^-1, (-2)^-1, (-2)^-1, ...}.
+        const MINUS_TWO_INVERSE: BFieldElement = BFieldElement::MINUS_TWO_INVERSE;
+        let even_zerofier_on_odd_domain_inverted = vec![FF::from(MINUS_TWO_INVERSE.value()); n / 2];
+        let odd_zerofier_on_even_domain_inverted = vec![FF::from(MINUS_TWO_INVERSE.value()); n / 2];
+
+        // 4. Construct interpolation values through Hadamard products.
+        let mut odd_domain_targets = even_zerofier_on_odd_domain_inverted;
+        let mut even_domain_targets = odd_zerofier_on_even_domain_inverted;
+        for i in 0..(n / 2) {
+            even_domain_targets[i] *= values[2 * i];
+            odd_domain_targets[i] *= values[2 * i + 1];
+        }
+
+        // 5. Interpolate using recursion
+        let even_interpolant =
+            Self::fast_modular_coset_interpolate(&even_domain_targets, offset, modulus);
+        let odd_interpolant =
+            Self::fast_modular_coset_interpolate(&odd_domain_targets, offset * omega, modulus);
+
+        // 6. Multiply with zerofiers and add.
+        let interpolant = even_interpolant
+            .multiply(&preprocessed.odd_zerofiers[(n / 2).ilog2() as usize])
+            + odd_interpolant.multiply(&preprocessed.even_zerofiers[(n / 2).ilog2() as usize]);
+
+        // 7. Reduce by modulus and return.
+        interpolant.reduce(modulus)
+    }
+
+    /// Extrapolate a Reed-Solomon codeword, defined relative to a coset of the
+    /// subgroup of order n (codeword length), in new points.
+    pub fn coset_extrapolate(
+        domain_offset: BFieldElement,
+        codeword: &[FF],
+        points: &[FF],
+    ) -> Vec<FF> {
+        if points.len() < Self::FAST_COSET_EXTRAPOLATE_THRESHOLD {
+            Self::fast_coset_extrapolate(domain_offset, codeword, points)
+        } else {
+            Self::naive_coset_extrapolate(domain_offset, codeword, points)
+        }
+    }
+
+    fn naive_coset_extrapolate_preprocessing(points: &[FF]) -> (ZerofierTree<FF>, Vec<FF>, usize) {
+        let zerofier_tree = ZerofierTree::new_from_domain(points);
+        let (shift_coefficients, tail_length) =
+            Self::shift_factor_ntt_with_tail_size(&zerofier_tree.zerofier());
+        (zerofier_tree, shift_coefficients, tail_length)
+    }
+
+    fn naive_coset_extrapolate(
+        domain_offset: BFieldElement,
+        codeword: &[FF],
+        points: &[FF],
+    ) -> Vec<FF> {
+        let n = codeword.len();
+        let logn = n.ilog2();
+        let mut coefficients = codeword.to_vec();
+        intt(
+            &mut coefficients,
+            BFieldElement::primitive_root_of_unity(n as u64).unwrap(),
+            logn,
+        );
+        let interpolant =
+            Polynomial::new(coefficients).scale(FF::from(domain_offset.inverse().value()));
+        interpolant.batch_evaluate(points)
+    }
+
+    fn fast_coset_extrapolate(
+        domain_offset: BFieldElement,
+        codeword: &[FF],
+        points: &[FF],
+    ) -> Vec<FF> {
+        let zerofier_tree = ZerofierTree::new_from_domain(points);
+        let minimal_interpolant = Self::fast_modular_coset_interpolate(
+            codeword,
+            domain_offset,
+            &zerofier_tree.zerofier(),
+        );
+        minimal_interpolant.divide_and_conquer_batch_evaluate(&zerofier_tree)
+    }
+
+    /// Extrapolate many Reed-Solomon codewords, defined relative to the same
+    /// coset of the subgroup of order `codeword_length`, in the same set of
+    /// new points.
+    ///
+    /// # Example
+    /// ```
+    /// # use twenty_first::prelude::*;
+    /// let n = 1 << 5;
+    /// let domain_offset = bfe!(7);
+    /// let codewords = [bfe_vec![3; n], bfe_vec![2; n]].concat();
+    /// let points = bfe_vec![0, 1];
+    /// assert_eq!(
+    ///     bfe_vec![3, 3, 2, 2],
+    ///     Polynomial::<BFieldElement>::batch_coset_extrapolate(
+    ///         domain_offset,
+    ///         n,
+    ///         &codewords,
+    ///         &points
+    ///     )
+    /// );
+    /// ```
+    ///
+    /// # Panics
+    /// Panics if the `codeword_length` is not a power of two.
+    pub fn batch_coset_extrapolate(
+        domain_offset: BFieldElement,
+        codeword_length: usize,
+        codewords: &[FF],
+        points: &[FF],
+    ) -> Vec<FF> {
+        if points.len() < Self::FAST_COSET_EXTRAPOLATE_THRESHOLD {
+            Self::batch_fast_coset_extrapolate(domain_offset, codeword_length, codewords, points)
+        } else {
+            Self::batch_naive_coset_extrapolate(domain_offset, codeword_length, codewords, points)
+        }
+    }
+
+    fn batch_fast_coset_extrapolate(
+        domain_offset: BFieldElement,
+        codeword_length: usize,
+        codewords: &[FF],
+        points: &[FF],
+    ) -> Vec<FF> {
+        let n = codeword_length;
+
+        let zerofier_tree = ZerofierTree::new_from_domain(points);
+        let modulus = zerofier_tree.zerofier();
+        let preprocessing_data = Self::fast_modular_coset_interpolate_preprocess(
+            codeword_length,
+            domain_offset,
+            &modulus,
+        );
+
+        (0..codewords.len() / n)
+            .flat_map(|i| {
+                let codeword = &codewords[i * n..(i + 1) * n];
+                let minimal_interpolant =
+                    Self::fast_modular_coset_interpolate_with_zerofiers_and_ntt_friendly_multiple(
+                        codeword,
+                        domain_offset,
+                        &modulus,
+                        &preprocessing_data,
+                    );
+                minimal_interpolant.divide_and_conquer_batch_evaluate(&zerofier_tree)
+            })
+            .collect()
+    }
+
+    fn batch_naive_coset_extrapolate(
+        domain_offset: BFieldElement,
+        codeword_length: usize,
+        codewords: &[FF],
+        points: &[FF],
+    ) -> Vec<FF> {
+        let (zerofier_tree, shift_coefficients, tail_length) =
+            Self::naive_coset_extrapolate_preprocessing(points);
+        let n = codeword_length;
+        let logn = n.ilog2();
+
+        (0..codewords.len() / n)
+            .flat_map(|i| {
+                let mut coefficients = codewords[i * n..(i + 1) * n].to_vec();
+                intt(
+                    &mut coefficients,
+                    BFieldElement::primitive_root_of_unity(n as u64).unwrap(),
+                    logn,
+                );
+                Polynomial::new(coefficients)
+                    .scale(FF::from(domain_offset.inverse().value()))
+                    .reduce_by_ntt_friendly_modulus(&shift_coefficients, tail_length)
+                    .divide_and_conquer_batch_evaluate(&zerofier_tree)
+            })
+            .collect()
+    }
+
+    /// Parallel version of [`batch_coset_extrapolate`](Self::batch_coset_extrapolate).
+    pub fn par_batch_coset_extrapolate(
+        domain_offset: BFieldElement,
+        codeword_length: usize,
+        codewords: &[FF],
+        points: &[FF],
+    ) -> Vec<FF> {
+        if points.len() < Self::FAST_COSET_EXTRAPOLATE_THRESHOLD {
+            Self::par_batch_fast_coset_extrapolate(
+                domain_offset,
+                codeword_length,
+                codewords,
+                points,
+            )
+        } else {
+            Self::par_batch_naive_coset_extrapolate(
+                domain_offset,
+                codeword_length,
+                codewords,
+                points,
+            )
+        }
+    }
+
+    fn par_batch_fast_coset_extrapolate(
+        domain_offset: BFieldElement,
+        codeword_length: usize,
+        codewords: &[FF],
+        points: &[FF],
+    ) -> Vec<FF> {
+        let n = codeword_length;
+
+        let zerofier_tree = ZerofierTree::new_from_domain(points);
+        let modulus = zerofier_tree.zerofier();
+        let preprocessing_data = Self::fast_modular_coset_interpolate_preprocess(
+            codeword_length,
+            domain_offset,
+            &modulus,
+        );
+
+        (0..codewords.len() / n)
+            .into_par_iter()
+            .flat_map(|i| {
+                let codeword = &codewords[i * n..(i + 1) * n];
+                let minimal_interpolant =
+                    Self::fast_modular_coset_interpolate_with_zerofiers_and_ntt_friendly_multiple(
+                        codeword,
+                        domain_offset,
+                        &modulus,
+                        &preprocessing_data,
+                    );
+                minimal_interpolant.divide_and_conquer_batch_evaluate(&zerofier_tree)
+            })
+            .collect()
+    }
+
+    fn par_batch_naive_coset_extrapolate(
+        domain_offset: BFieldElement,
+        codeword_length: usize,
+        codewords: &[FF],
+        points: &[FF],
+    ) -> Vec<FF> {
+        let (zerofier_tree, shift_coefficients, tail_length) =
+            Self::naive_coset_extrapolate_preprocessing(points);
+        let n = codeword_length;
+        let logn = n.ilog2();
+
+        (0..codewords.len() / n)
+            .into_par_iter()
+            .flat_map(|i| {
+                let mut coefficients = codewords[i * n..(i + 1) * n].to_vec();
+                intt(
+                    &mut coefficients,
+                    BFieldElement::primitive_root_of_unity(n as u64).unwrap(),
+                    logn,
+                );
+                Polynomial::new(coefficients)
+                    .scale(FF::from(domain_offset.inverse().value()))
+                    .reduce_by_ntt_friendly_modulus(&shift_coefficients, tail_length)
+                    .divide_and_conquer_batch_evaluate(&zerofier_tree)
+            })
+            .collect()
+    }
 }
 
 impl Polynomial<BFieldElement> {
@@ -2577,6 +3037,23 @@ mod test_polynomials {
         prop_assert_eq!(a * b, product);
     }
 
+    #[proptest]
+    fn batch_multiply_agrees_with_iterative_multiply(a: Vec<Polynomial<BFieldElement>>) {
+        let mut acc = Polynomial::one();
+        for factor in &a {
+            acc = acc.multiply(factor);
+        }
+        prop_assert_eq!(acc, Polynomial::batch_multiply(&a));
+    }
+
+    #[proptest]
+    fn par_batch_multiply_agrees_with_batch_multiply(a: Vec<Polynomial<BFieldElement>>) {
+        prop_assert_eq!(
+            Polynomial::batch_multiply(&a),
+            Polynomial::par_batch_multiply(&a)
+        );
+    }
+
     #[proptest(cases = 50)]
     fn naive_zerofier_and_fast_zerofier_are_identical(
         #[any(size_range(..Polynomial::<BFieldElement>::FAST_ZEROFIER_CUTOFF_THRESHOLD * 2).lift())]
@@ -2630,6 +3107,13 @@ mod test_polynomials {
             zerofier.leading_coefficient().unwrap()
         );
     }
+    #[proptest]
+    fn par_zerofier_agrees_with_zerofier(domain: Vec<BFieldElement>) {
+        prop_assert_eq!(
+            Polynomial::zerofier(&domain),
+            Polynomial::par_zerofier(&domain)
+        );
+    }
 
     #[test]
     fn fast_evaluate_on_hardcoded_domain_and_polynomial() {
@@ -2651,16 +3135,6 @@ mod test_polynomials {
         prop_assert_eq!(evaluations, fast_evaluations);
     }
 
-    #[proptest]
-    fn slow_and_vector_batch_polynomial_evaluation_are_equivalent(
-        poly: Polynomial<BFieldElement>,
-        #[any(size_range(..1024).lift())] domain: Vec<BFieldElement>,
-    ) {
-        let evaluations = domain.iter().map(|&x| poly.evaluate(x)).collect_vec();
-        let vector_batch_evaluations = poly.vector_batch_evaluate(&domain);
-        prop_assert_eq!(evaluations, vector_batch_evaluations);
-    }
-
     #[test]
     #[should_panic(expected = "zero points")]
     fn interpolation_through_no_points_is_impossible() {
@@ -2848,15 +3322,6 @@ mod test_polynomials {
         prop_assert_eq!(a, quot * b);
     }
 
-    #[proptest]
-    fn naive_division_and_fast_division_are_equivalent(
-        a: Polynomial<BFieldElement>,
-        #[filter(!#b.is_zero())] b: Polynomial<BFieldElement>,
-    ) {
-        let (quotient, _remainder) = a.fast_divide(&b);
-        prop_assert_eq!(a / b, quotient);
-    }
-
     #[proptest]
     fn clean_division_agrees_with_divide_on_clean_division(
         #[strategy(arb())] a: Polynomial<BFieldElement>,
@@ -3207,7 +3672,7 @@ mod test_polynomials {
 
     #[test]
     fn formal_power_series_inverse_newton_concrete() {
-        let f = Polynomial::<BFieldElement>::new(vec![
+        let f = Polynomial::new(vec![
             BFieldElement::new(3618372803227210457),
             BFieldElement::new(14620511201754172786),
             BFieldElement::new(2577803283145951105),
@@ -3253,7 +3718,7 @@ mod test_polynomials {
         #[strategy(vec(arb(), 1..30))]
         coefficients: Vec<BFieldElement>,
     ) {
-        let polynomial = Polynomial::<BFieldElement>::new(coefficients);
+        let polynomial = Polynomial::new(coefficients);
         let multiple = polynomial.structured_multiple();
         let (_quotient, remainder) = multiple.divide(&polynomial);
         prop_assert!(remainder.is_zero());
@@ -3265,7 +3730,7 @@ mod test_polynomials {
         #[strategy(vec(arb(), 1..30))]
         coefficients: Vec<BFieldElement>,
     ) {
-        let polynomial = Polynomial::<BFieldElement>::new(coefficients);
+        let polynomial = Polynomial::new(coefficients);
         let n = polynomial.degree() as usize;
         let structured_multiple = polynomial.structured_multiple();
         assert!(structured_multiple.degree() as usize <= 2 * n);
@@ -3280,17 +3745,17 @@ mod test_polynomials {
         let remainder = structured_multiple.reduce_long_division(&x2n);
         assert_eq!(remainder.degree() as usize, n - 1);
         assert_eq!(
+            0,
             (structured_multiple.clone() - remainder.clone())
                 .reverse()
                 .degree() as usize,
-            0
         );
         assert_eq!(
+            BFieldElement::new(1),
             *(structured_multiple.clone() - remainder)
                 .coefficients
                 .last()
                 .unwrap(),
-            BFieldElement::new(1)
         );
     }
 
@@ -3314,7 +3779,7 @@ mod test_polynomials {
             .concat(),
         );
         let (_quotient, remainder) = structured_multiple.divide(&x2n);
-        assert_eq!(remainder.degree() as usize, n - 1);
+        assert_eq!(n - 1, remainder.degree() as usize);
         assert_eq!(
             (structured_multiple.clone() - remainder.clone())
                 .reverse()
@@ -3337,7 +3802,7 @@ mod test_polynomials {
         #[strategy(vec(arb(), 1..usize::min(30, #n)))]
         coefficients: Vec<BFieldElement>,
     ) {
-        let polynomial = Polynomial::<BFieldElement>::new(coefficients);
+        let polynomial = Polynomial::new(coefficients);
         let multiple = polynomial.structured_multiple_of_degree(n);
         let remainder = multiple.reduce_long_division(&polynomial);
         prop_assert!(remainder.is_zero());
@@ -3349,7 +3814,7 @@ mod test_polynomials {
         #[strategy(vec(arb(), 3..usize::min(30, #n)))] mut coefficients: Vec<BFieldElement>,
     ) {
         *coefficients.last_mut().unwrap() = BFieldElement::new(1);
-        let polynomial = Polynomial::<BFieldElement>::new(coefficients);
+        let polynomial = Polynomial::new(coefficients);
         let structured_multiple = polynomial.structured_multiple_of_degree(n);
 
         let xn = Polynomial::new(
@@ -3382,7 +3847,7 @@ mod test_polynomials {
         #[strategy(vec(arb(), 1..usize::min(30, #n)))]
         coefficients: Vec<BFieldElement>,
     ) {
-        let polynomial = Polynomial::<BFieldElement>::new(coefficients);
+        let polynomial = Polynomial::new(coefficients);
         let multiple = polynomial.structured_multiple_of_degree(n);
         prop_assert_eq!(
             multiple.degree() as usize,
@@ -3397,8 +3862,7 @@ mod test_polynomials {
     fn reverse_polynomial_with_nonzero_constant_term_twice_gives_original_back(
         f: Polynomial<BFieldElement>,
     ) {
-        let fx_plus_1 = f.shift_coefficients(1)
-            + Polynomial::<BFieldElement>::from_constant(BFieldElement::new(1));
+        let fx_plus_1 = f.shift_coefficients(1) + Polynomial::from_constant(bfe!(1));
         prop_assert_eq!(fx_plus_1.clone(), fx_plus_1.reverse().reverse());
     }
 
@@ -3539,7 +4003,8 @@ mod test_polynomials {
     ) {
         let polynomial = Polynomial::new(coefficients);
         let evaluations_iterative = polynomial.iterative_batch_evaluate(&domain);
-        let evaluations_fast = polynomial.divide_and_conquer_batch_evaluate(&domain);
+        let zerofier_tree = ZerofierTree::new_from_domain(&domain);
+        let evaluations_fast = polynomial.divide_and_conquer_batch_evaluate(&zerofier_tree);
         let evaluations_reduce_then = polynomial.reduce_then_batch_evaluate(&domain);
 
         prop_assert_eq!(evaluations_iterative.clone(), evaluations_fast);
@@ -3553,4 +4018,175 @@ mod test_polynomials {
     ) {
         prop_assert_eq!(a.divide(&b).1, a.reduce(&b));
     }
+
+    #[proptest]
+    fn structured_multiple_of_monomial_term_is_actually_multiple_and_of_right_degree(
+        #[strategy(1usize..1000)] degree: usize,
+        #[filter(!#leading_coefficient.is_zero())] leading_coefficient: BFieldElement,
+        #[strategy(#degree+1..#degree+200)] target_degree: usize,
+    ) {
+        let coefficients = [bfe_vec![0; degree], vec![leading_coefficient]].concat();
+        let polynomial = Polynomial::new(coefficients);
+        let multiple = polynomial.structured_multiple_of_degree(target_degree);
+        prop_assert_eq!(Polynomial::zero(), multiple.reduce(&polynomial));
+        prop_assert_eq!(multiple.degree() as usize, target_degree);
+    }
+
+    #[proptest]
+    fn monomial_term_divided_by_smaller_monomial_term_gives_clean_division(
+        #[strategy(100usize..102)] high_degree: usize,
+        #[filter(!#high_lc.is_zero())] high_lc: BFieldElement,
+        #[strategy(83..#high_degree)] low_degree: usize,
+        #[filter(!#low_lc.is_zero())] low_lc: BFieldElement,
+    ) {
+        let numerator = Polynomial::new([bfe_vec![0; high_degree], vec![high_lc]].concat());
+        let denominator = Polynomial::new([bfe_vec![0; low_degree], vec![low_lc]].concat());
+        let (quotient, remainder) = numerator.divide(&denominator);
+        prop_assert_eq!(
+            quotient
+                .coefficients
+                .iter()
+                .filter(|c| !c.is_zero())
+                .count(),
+            1
+        );
+        prop_assert_eq!(Polynomial::zero(), remainder);
+    }
+
+    #[proptest]
+    fn fast_modular_coset_interpolate_agrees_with_interpolate_then_reduce_property(
+        #[filter(!#modulus.is_zero())] modulus: Polynomial<BFieldElement>,
+        #[strategy(0usize..10)] _logn: usize,
+        #[strategy(Just(1 << #_logn))] n: usize,
+        #[strategy(vec(arb(), #n))] values: Vec<BFieldElement>,
+        #[strategy(arb())] offset: BFieldElement,
+    ) {
+        let omega = BFieldElement::primitive_root_of_unity(n as u64).unwrap();
+        let domain = (0..n)
+            .scan(offset, |acc: &mut BFieldElement, _| {
+                let yld = *acc;
+                *acc *= omega;
+                Some(yld)
+            })
+            .collect_vec();
+        prop_assert_eq!(
+            Polynomial::fast_modular_coset_interpolate(&values, offset, &modulus),
+            Polynomial::interpolate(&domain, &values).reduce(&modulus)
+        )
+    }
+
+    #[test]
+    fn fast_modular_coset_interpolate_agrees_with_interpolate_then_reduce_concrete() {
+        let logn = 8;
+        let n = 1u64 << logn;
+        let modulus = Polynomial::new(bfe_vec![2, 3, 1]);
+        let values = (0..n).map(|i| BFieldElement::new(i / 5)).collect_vec();
+        let offset = BFieldElement::new(7);
+
+        let omega = BFieldElement::primitive_root_of_unity(n).unwrap();
+        let mut domain = bfe_vec![0; n as usize];
+        domain[0] = offset;
+        for i in 1..n as usize {
+            domain[i] = domain[i - 1] * omega;
+        }
+        assert_eq!(
+            Polynomial::interpolate(&domain, &values).reduce(&modulus),
+            Polynomial::fast_modular_coset_interpolate(&values, offset, &modulus),
+        )
+    }
+
+    #[proptest]
+    fn coset_extrapolation_methods_agree_with_interpolate_then_evaluate(
+        #[strategy(0usize..10)] _logn: usize,
+        #[strategy(Just(1 << #_logn))] n: usize,
+        #[strategy(vec(arb(), #n))] values: Vec<BFieldElement>,
+        #[strategy(arb())] offset: BFieldElement,
+        #[strategy(vec(arb(), 1..1000))] points: Vec<BFieldElement>,
+    ) {
+        let omega = BFieldElement::primitive_root_of_unity(n as u64).unwrap();
+        let domain = (0..n)
+            .scan(offset, |acc: &mut BFieldElement, _| {
+                let yld = *acc;
+                *acc *= omega;
+                Some(yld)
+            })
+            .collect_vec();
+        let fast_coset_extrapolation = Polynomial::fast_coset_extrapolate(offset, &values, &points);
+        let naive_coset_extrapolation =
+            Polynomial::naive_coset_extrapolate(offset, &values, &points);
+        let interpolation_then_evaluation =
+            Polynomial::interpolate(&domain, &values).batch_evaluate(&points);
+        prop_assert_eq!(fast_coset_extrapolation.clone(), naive_coset_extrapolation);
+        prop_assert_eq!(fast_coset_extrapolation, interpolation_then_evaluation);
+    }
+
+    #[proptest]
+    fn coset_extrapolate_and_batch_coset_extrapolate_agree(
+        #[strategy(1usize..10)] _logn: usize,
+        #[strategy(Just(1<<#_logn))] n: usize,
+        #[strategy(0usize..5)] _m: usize,
+        #[strategy(vec(arb(), #_m*#n))] codewords: Vec<BFieldElement>,
+        #[strategy(vec(arb(), 0..20))] points: Vec<BFieldElement>,
+    ) {
+        let offset = BFieldElement::new(7);
+
+        let one_by_one_dispatch = codewords
+            .chunks(n)
+            .flat_map(|chunk| Polynomial::coset_extrapolate(offset, chunk, &points))
+            .collect_vec();
+        let batched_dispatch = Polynomial::batch_coset_extrapolate(offset, n, &codewords, &points);
+        let par_batched_dispatch =
+            Polynomial::par_batch_coset_extrapolate(offset, n, &codewords, &points);
+        prop_assert_eq!(one_by_one_dispatch.clone(), batched_dispatch);
+        prop_assert_eq!(one_by_one_dispatch, par_batched_dispatch);
+
+        let one_by_one_fast = codewords
+            .chunks(n)
+            .flat_map(|chunk| Polynomial::fast_coset_extrapolate(offset, chunk, &points))
+            .collect_vec();
+        let batched_fast = Polynomial::batch_fast_coset_extrapolate(offset, n, &codewords, &points);
+        let par_batched_fast =
+            Polynomial::par_batch_fast_coset_extrapolate(offset, n, &codewords, &points);
+        prop_assert_eq!(one_by_one_fast.clone(), batched_fast);
+        prop_assert_eq!(one_by_one_fast, par_batched_fast);
+
+        let one_by_one_naive = codewords
+            .chunks(n)
+            .flat_map(|chunk| Polynomial::naive_coset_extrapolate(offset, chunk, &points))
+            .collect_vec();
+        let batched_naive =
+            Polynomial::batch_naive_coset_extrapolate(offset, n, &codewords, &points);
+        let par_batched_naive =
+            Polynomial::par_batch_naive_coset_extrapolate(offset, n, &codewords, &points);
+        prop_assert_eq!(one_by_one_naive.clone(), batched_naive);
+        prop_assert_eq!(one_by_one_naive, par_batched_naive);
+    }
+
+    #[test]
+    fn fast_modular_coset_interpolate_thresholds_relate_properly() {
+        let intt = Polynomial::<BFieldElement>::FAST_MODULAR_COSET_INTERPOLATE_CUTOFF_THRESHOLD_PREFER_INTT;
+        let lagrange = Polynomial::<BFieldElement>::FAST_MODULAR_COSET_INTERPOLATE_CUTOFF_THRESHOLD_PREFER_LAGRANGE;
+        assert!(intt > lagrange);
+    }
+
+    #[proptest]
+    fn interpolate_and_par_interpolate_agree(
+        #[filter(#points.len() > 0)] points: Vec<BFieldElement>,
+        #[strategy(vec(arb(), #points.len()))] domain: Vec<BFieldElement>,
+    ) {
+        let expected_interpolant = Polynomial::interpolate(&domain, &points);
+        let observed_interpolant = Polynomial::par_interpolate(&domain, &points);
+        prop_assert_eq!(expected_interpolant, observed_interpolant);
+    }
+
+    #[proptest]
+    fn batch_evaluate_agrees_with_par_batch_evalaute(
+        polynomial: Polynomial<BFieldElement>,
+        points: Vec<BFieldElement>,
+    ) {
+        prop_assert_eq!(
+            polynomial.batch_evaluate(&points),
+            polynomial.par_batch_evaluate(&points)
+        );
+    }
 }
diff --git a/twenty-first/src/math/zerofier_tree.rs b/twenty-first/src/math/zerofier_tree.rs
new file mode 100644
index 000000000..7b7b7ffce
--- /dev/null
+++ b/twenty-first/src/math/zerofier_tree.rs
@@ -0,0 +1,160 @@
+use std::collections::VecDeque;
+use std::ops::MulAssign;
+
+use num_traits::One;
+
+use super::b_field_element::BFieldElement;
+use super::polynomial::Polynomial;
+use super::traits::FiniteField;
+
+#[derive(Debug, Clone, PartialEq)]
+pub struct Leaf<FF: FiniteField + MulAssign<BFieldElement>> {
+    pub(crate) points: Vec<FF>,
+    zerofier: Polynomial<FF>,
+}
+
+impl<FF> Leaf<FF>
+where
+    FF: FiniteField + MulAssign<BFieldElement>,
+{
+    pub fn new(points: Vec<FF>) -> Self {
+        let zerofier = Polynomial::zerofier(&points);
+        Self { points, zerofier }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq)]
+pub struct Branch<FF: FiniteField + MulAssign<BFieldElement>> {
+    zerofier: Polynomial<FF>,
+    pub(crate) left: ZerofierTree<FF>,
+    pub(crate) right: ZerofierTree<FF>,
+}
+
+impl<FF> Branch<FF>
+where
+    FF: FiniteField + MulAssign<BFieldElement>,
+{
+    pub fn new(left: ZerofierTree<FF>, right: ZerofierTree<FF>) -> Self {
+        let zerofier = left.zerofier().multiply(&right.zerofier());
+        Self {
+            zerofier,
+            left,
+            right,
+        }
+    }
+}
+
+/// A zerofier tree is a balanced binary tree of vanishing polynomials.
+/// Conceptually, every leaf corresponds to a single point, and the value of
+/// that leaf is the monic linear polynomial that evaluates to zero there and
+/// no-where else. Every non-leaf node is the product of its two children.
+/// In practice, it makes sense to truncate the tree depth, in which case every
+/// leaf contains a chunk of points whose size is upper-bounded and more or less
+/// equal to some constant threshold.
+#[derive(Debug, Clone, PartialEq)]
+pub enum ZerofierTree<FF: FiniteField + MulAssign<BFieldElement>> {
+    Leaf(Leaf<FF>),
+    Branch(Box<Branch<FF>>),
+    Padding,
+}
+
+impl<FF: FiniteField + MulAssign<BFieldElement>> ZerofierTree<FF> {
+    /// Regulates the depth at which the tree is truncated. Phrased differently,
+    /// regulates the number of points contained by each leaf.
+    const RECURSION_CUTOFF_THRESHOLD: usize = 16;
+
+    pub fn new_from_domain(domain: &[FF]) -> Self {
+        let mut nodes = domain
+            .chunks(Self::RECURSION_CUTOFF_THRESHOLD)
+            .map(|chunk| {
+                let leaf = Leaf::new(chunk.to_vec());
+                ZerofierTree::Leaf(leaf)
+            })
+            .collect::<VecDeque<_>>();
+        nodes.resize(nodes.len().next_power_of_two(), ZerofierTree::Padding);
+        while nodes.len() > 1 {
+            let right = nodes.pop_back().unwrap();
+            let left = nodes.pop_back().unwrap();
+            if left == ZerofierTree::Padding {
+                nodes.push_front(ZerofierTree::Padding);
+            } else {
+                let new_node = Branch::new(left, right);
+                nodes.push_front(ZerofierTree::Branch(Box::new(new_node)));
+            }
+        }
+        nodes.pop_front().unwrap()
+    }
+
+    pub fn zerofier(&self) -> Polynomial<FF> {
+        match self {
+            ZerofierTree::Leaf(leaf) => leaf.zerofier.clone(),
+            ZerofierTree::Branch(branch) => branch.zerofier.clone(),
+            ZerofierTree::Padding => Polynomial::<FF>::one(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use num_traits::Zero;
+    use proptest::collection::vec;
+    use proptest::prop_assert_eq;
+    use proptest_arbitrary_interop::arb;
+    use test_strategy::proptest;
+
+    use crate::math::zerofier_tree::ZerofierTree;
+    use crate::prelude::BFieldElement;
+    use crate::prelude::Polynomial;
+
+    #[test]
+    fn zerofier_tree_can_be_empty() {
+        ZerofierTree::<BFieldElement>::new_from_domain(&[]);
+    }
+    #[proptest]
+    fn zerofier_tree_root_is_multiple_of_children(
+        #[strategy(vec(arb(), 2*ZerofierTree::<BFieldElement>::RECURSION_CUTOFF_THRESHOLD))]
+        points: Vec<BFieldElement>,
+    ) {
+        let zerofier_tree = ZerofierTree::new_from_domain(&points);
+        let ZerofierTree::Branch(ref branch) = &zerofier_tree else {
+            panic!("not enough leafs");
+        };
+        prop_assert_eq!(
+            Polynomial::zero(),
+            zerofier_tree.zerofier().reduce(&branch.left.zerofier())
+        );
+        prop_assert_eq!(
+            Polynomial::zero(),
+            zerofier_tree.zerofier().reduce(&branch.right.zerofier())
+        );
+    }
+
+    #[proptest]
+    fn zerofier_tree_root_has_right_degree(
+        #[strategy(vec(arb(), 1..(1<<10)))] points: Vec<BFieldElement>,
+    ) {
+        let zerofier_tree = ZerofierTree::new_from_domain(&points);
+        prop_assert_eq!(points.len(), zerofier_tree.zerofier().degree() as usize);
+    }
+
+    #[proptest]
+    fn zerofier_tree_root_zerofies(
+        #[strategy(vec(arb(), 1..(1<<10)))] points: Vec<BFieldElement>,
+        #[strategy(0usize..#points.len())] index: usize,
+    ) {
+        let zerofier_tree = ZerofierTree::new_from_domain(&points);
+        prop_assert_eq!(
+            BFieldElement::zero(),
+            zerofier_tree.zerofier().evaluate(points[index])
+        );
+    }
+
+    #[proptest]
+    fn zerofier_tree_and_polynomial_agree_on_zerofiers(
+        #[strategy(vec(arb(), 1..(1<<10)))] points: Vec<BFieldElement>,
+    ) {
+        let zerofier_tree = ZerofierTree::new_from_domain(&points);
+        let polynomial_zerofier = Polynomial::zerofier(&points);
+        prop_assert_eq!(polynomial_zerofier, zerofier_tree.zerofier());
+    }
+}