diff --git a/src/boxed/uint.rs b/src/boxed/uint.rs
index 7d5c94377..767289e59 100644
--- a/src/boxed/uint.rs
+++ b/src/boxed/uint.rs
@@ -5,6 +5,7 @@ mod add_mod;
 mod bit_and;
 mod cmp;
 pub(crate) mod encoding;
+mod modular;
 mod mul;
 mod sub;
 mod sub_mod;
@@ -262,6 +263,14 @@ impl From<u128> for BoxedUint {
     }
 }
 
+impl From<&[Limb]> for BoxedUint {
+    /// Builds a [`BoxedUint`] holding an owned copy of `limbs`.
+    fn from(limbs: &[Limb]) -> BoxedUint {
+        let limbs: Box<[Limb]> = limbs.into();
+        Self { limbs }
+    }
+}
+
 impl From<Box<[Limb]>> for BoxedUint {
     fn from(limbs: Box<[Limb]>) -> BoxedUint {
         Self { limbs }
diff --git a/src/boxed/uint/modular.rs b/src/boxed/uint/modular.rs
new file mode 100644
index 000000000..8b7d8cb10
--- /dev/null
+++ b/src/boxed/uint/modular.rs
@@ -0,0 +1,25 @@
+//! Modular arithmetic support for [`BoxedUint`].
+
+use super::BoxedUint;
+use crate::{uint::modular::reduction::montgomery_reduction_core, Limb};
+
+/// Multiplies `a` by `b` and applies a Montgomery reduction modulo `modulus`.
+#[allow(dead_code)]
+pub(crate) fn mul_montgomery_form(
+    a: &BoxedUint,
+    b: &BoxedUint,
+    modulus: &BoxedUint,
+    mod_neg_inv: Limb,
+) -> BoxedUint {
+    let nlimbs = modulus.nlimbs();
+    debug_assert_eq!(a.nlimbs(), nlimbs);
+    debug_assert_eq!(b.nlimbs(), nlimbs);
+
+    let mut wide = a.mul_wide(b);
+    let (lo, hi) = wide.limbs.split_at_mut(nlimbs);
+    let meta_carry = montgomery_reduction_core(lo, hi, &modulus.limbs, mod_neg_inv);
+    let reduced = BoxedUint::from(&*hi);
+    #[cfg(feature = "zeroize")]
+    zeroize::Zeroize::zeroize(&mut wide);
+    reduced.sub_mod_with_carry(meta_carry, modulus, modulus)
+}
diff --git a/src/boxed/uint/sub_mod.rs b/src/boxed/uint/sub_mod.rs
index 2f5804180..6f4420ef1 100644
--- a/src/boxed/uint/sub_mod.rs
+++ b/src/boxed/uint/sub_mod.rs
@@ -18,6 +18,26 @@ impl BoxedUint {
         // borrow = 0x000...000. Thus, we use it as a mask to conditionally add the modulus.
         out.wrapping_add(&p.bitand_limb(mask))
     }
+
+    /// Returns `(self..., carry) - (rhs...) mod (p...)`, where `carry <= 1`.
+    /// Assumes `-(p...) <= (self..., carry) - (rhs...) < (p...)`.
+    #[inline(always)]
+    pub(crate) fn sub_mod_with_carry(&self, carry: Limb, rhs: &Self, p: &Self) -> Self {
+        debug_assert_eq!(self.nlimbs(), p.nlimbs());
+        debug_assert_eq!(rhs.nlimbs(), p.nlimbs());
+        debug_assert!(carry.0 <= 1);
+
+        let (out, borrow) = self.sbb(rhs, Limb::ZERO);
+
+        // The new `borrow = Word::MAX` iff `carry == 0` and `borrow == Word::MAX`.
+        let mask = Limb((!carry.0.wrapping_neg()) & borrow.0);
+
+        // `mask` is all ones if the subtraction underflowed past `carry`, all zeros otherwise.
+        // Apply it limb-wise (as `sub_mod` does) so no full-width mask has to be allocated.
+        let addend = p.bitand_limb(mask);
+
+        out.wrapping_add(&addend)
+    }
 }
 
 impl SubMod for BoxedUint {
diff --git a/src/uint/modular.rs b/src/uint/modular.rs
index b3e61ddbc..cb460abf1 100644
--- a/src/uint/modular.rs
+++ b/src/uint/modular.rs
@@ -17,7 +17,7 @@
 //! the modulus can vary at runtime.
 
 mod constant_mod;
-mod reduction;
+pub(crate) mod reduction;
 mod runtime_mod;
 
 mod add;
diff --git a/src/uint/modular/reduction.rs b/src/uint/modular/reduction.rs
index b206ae32f..32130a0da 100644
--- a/src/uint/modular/reduction.rs
+++ b/src/uint/modular/reduction.rs
@@ -10,46 +10,70 @@ const fn muladdcarry(x: Word, y: Word, z: Word, w: Word) -> (Word, Word) {
     ((res >> Word::BITS) as Word, res as Word)
 }
 
-/// Algorithm 14.32 in Handbook of Applied Cryptography <https://cacr.uwaterloo.ca/hac/about/chap14.pdf>
-pub const fn montgomery_reduction<const LIMBS: usize>(
-    lower_upper: &(Uint<LIMBS>, Uint<LIMBS>),
-    modulus: &Uint<LIMBS>,
-    mod_neg_inv: Limb,
-) -> Uint<LIMBS> {
-    let (mut lower, mut upper) = *lower_upper;
+/// Expands to the core Montgomery reduction loop and evaluates to the final carry limb.
+///
+/// This is implemented as a macro to abstract over `const fn` and boxed use cases, since the latter
+/// needs mutable references and thus the unstable `const_mut_refs` feature (rust-lang/rust#57349).
+// TODO(tarcieri): change this into a `const fn` when `const_mut_refs` is stable
+macro_rules! impl_montgomery_reduction {
+    ($upper:expr, $lower:expr, $modulus:expr, $mod_neg_inv:expr, $limbs:expr) => {{
+        let mut meta_carry = Limb(0); // carry chained across outer iterations
+        let mut new_sum;
 
-    let mut meta_carry = Limb(0);
-    let mut new_sum;
+        let mut i = 0;
+        while i < $limbs {
+            let u = $lower[i].0.wrapping_mul($mod_neg_inv.0);
 
-    let mut i = 0;
-    while i < LIMBS {
-        let u = lower.limbs[i].0.wrapping_mul(mod_neg_inv.0);
+            let (mut carry, _) = muladdcarry(u, $modulus[0].0, $lower[i].0, 0);
+            let mut new_limb;
 
-        let (mut carry, _) = muladdcarry(u, modulus.limbs[0].0, lower.limbs[i].0, 0);
-        let mut new_limb;
+            let mut j = 1;
+            while j < ($limbs - i) { // targets still inside `$lower`
+                (carry, new_limb) = muladdcarry(u, $modulus[j].0, $lower[i + j].0, carry);
+                $lower[i + j] = Limb(new_limb);
+                j += 1;
+            }
+            while j < $limbs { // remaining targets fall in `$upper`
+                (carry, new_limb) = muladdcarry(u, $modulus[j].0, $upper[i + j - $limbs].0, carry);
+                $upper[i + j - $limbs] = Limb(new_limb);
+                j += 1;
+            }
 
-        let mut j = 1;
-        while j < (LIMBS - i) {
-            (carry, new_limb) = muladdcarry(u, modulus.limbs[j].0, lower.limbs[i + j].0, carry);
-            lower.limbs[i + j] = Limb(new_limb);
-            j += 1;
-        }
-        while j < LIMBS {
-            (carry, new_limb) =
-                muladdcarry(u, modulus.limbs[j].0, upper.limbs[i + j - LIMBS].0, carry);
-            upper.limbs[i + j - LIMBS] = Limb(new_limb);
-            j += 1;
+            (new_sum, meta_carry) = $upper[i].adc(Limb(carry), meta_carry);
+            $upper[i] = new_sum;
+
+            i += 1;
         }
 
-        (new_sum, meta_carry) = upper.limbs[i].adc(Limb(carry), meta_carry);
-        upper.limbs[i] = new_sum;
+        meta_carry // value of the macro expression
+    }};
+}
 
-        i += 1;
-    }
+/// Algorithm 14.32 in Handbook of Applied Cryptography <https://cacr.uwaterloo.ca/hac/about/chap14.pdf>
+pub const fn montgomery_reduction<const LIMBS: usize>(
+    lower_upper: &(Uint<LIMBS>, Uint<LIMBS>),
+    modulus: &Uint<LIMBS>,
+    mod_neg_inv: Limb,
+) -> Uint<LIMBS> {
+    let (mut lower, mut upper) = *lower_upper; // copies; the caller's tuple is left untouched
+    let meta_carry =
+        impl_montgomery_reduction!(upper.limbs, lower.limbs, &modulus.limbs, mod_neg_inv, LIMBS);
+
     // Division is simply taking the upper half of the limbs
     // Final reduction (at this point, the value is at most 2 * modulus,
     // so `meta_carry` is either 0 or 1)
-
     upper.sub_mod_with_carry(meta_carry, modulus, modulus)
 }
+
+/// Shim used by [`BoxedUint`]: reduces the `lower`/`upper` limb slices in place, returns the carry.
+#[cfg(feature = "alloc")]
+pub(crate) fn montgomery_reduction_core(
+    lower: &mut [Limb],
+    upper: &mut [Limb],
+    modulus: &[Limb],
+    mod_neg_inv: Limb,
+) -> Limb {
+    debug_assert_eq!(lower.len(), modulus.len());
+    debug_assert_eq!(upper.len(), modulus.len());
+    impl_montgomery_reduction!(upper, lower, modulus, mod_neg_inv, modulus.len())
+}