diff --git a/Cargo.toml b/Cargo.toml
index d2dd40127b..670b59c426 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -65,12 +65,14 @@ include = [
     "crypto/fipsmodule/bn/internal.h",
     "crypto/fipsmodule/bn/montgomery.c",
     "crypto/fipsmodule/bn/montgomery_inv.c",
+    "crypto/fipsmodule/bn/shift.c",
     "crypto/fipsmodule/ec/asm/p256-armv8-asm.pl",
     "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl",
     "crypto/fipsmodule/ec/ecp_nistz.c",
     "crypto/fipsmodule/ec/ecp_nistz.h",
     "crypto/fipsmodule/ec/ecp_nistz384.h",
     "crypto/fipsmodule/ec/ecp_nistz384.inl",
+    "crypto/fipsmodule/ec/internal.h",
     "crypto/fipsmodule/ec/gfp_p256.c",
     "crypto/fipsmodule/ec/gfp_p384.c",
     "crypto/fipsmodule/ec/p256.c",
@@ -80,6 +82,7 @@ include = [
     "crypto/fipsmodule/ec/p256_shared.h",
     "crypto/fipsmodule/ec/p256_table.h",
     "crypto/fipsmodule/ec/util.h",
+    "crypto/fipsmodule/ec/wnaf.c",
     "crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt",
     "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl",
     "crypto/fipsmodule/modes/asm/ghash-armv4.pl",
diff --git a/build.rs b/build.rs
index 3cbae9d926..0446e3ec10 100644
--- a/build.rs
+++ b/build.rs
@@ -38,10 +38,12 @@ const RING_SRCS: &[(&[&str], &str)] = &[
     (&[], "crypto/fipsmodule/aes/aes_nohw.c"),
     (&[], "crypto/fipsmodule/bn/montgomery.c"),
     (&[], "crypto/fipsmodule/bn/montgomery_inv.c"),
+    (&[], "crypto/fipsmodule/bn/shift.c"),
     (&[], "crypto/fipsmodule/ec/ecp_nistz.c"),
     (&[], "crypto/fipsmodule/ec/gfp_p256.c"),
     (&[], "crypto/fipsmodule/ec/gfp_p384.c"),
     (&[], "crypto/fipsmodule/ec/p256.c"),
+    (&[], "crypto/fipsmodule/ec/wnaf.c"),
     (&[], "crypto/limbs/limbs.c"),
     (&[], "crypto/mem.c"),
     (&[], "crypto/poly1305/poly1305.c"),
@@ -959,6 +961,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "p256_point_mul",
         "p256_point_mul_base",
         "p256_point_mul_base_vartime",
+        "p256_point_mul_public",
         "p256_scalar_mul_mont",
         "p256_scalar_sqr_rep_mont",
         "p256_sqr_mont",
diff --git a/crypto/fipsmodule/ec/internal.h b/crypto/fipsmodule/ec/internal.h
index cf7c807256..99c47bae39 100644
--- a/crypto/fipsmodule/ec/internal.h
+++ b/crypto/fipsmodule/ec/internal.h
@@ -68,7 +68,7 @@
 #ifndef OPENSSL_HEADER_EC_INTERNAL_H
 #define OPENSSL_HEADER_EC_INTERNAL_H
 
-#include <openssl/base.h>
+#include <ring-core/base.h>
 
 // ec_compute_wNAF writes the modified width-(w+1) Non-Adjacent Form (wNAF) of
 // |scalar| to |out|. |out| must have room for |bits| + 1 elements, each of
@@ -78,7 +78,6 @@
 // where at most one of any  w+1  consecutive digits is non-zero
 // with the exception that the most significant digit may be only
 // w-1 zeros away from that next non-zero digit.
-void ec_compute_wNAF(const EC_GROUP *group, int8_t *out,
-                     const EC_SCALAR *scalar, size_t bits, int w);
+void ec_compute_wNAF(int8_t *out, const BN_ULONG *scalar, size_t scalar_limbs, size_t bits, int w);
 
 #endif  // OPENSSL_HEADER_EC_INTERNAL_H
diff --git a/crypto/fipsmodule/ec/p256.c b/crypto/fipsmodule/ec/p256.c
index dc67a71f07..f40fc9afd5 100644
--- a/crypto/fipsmodule/ec/p256.c
+++ b/crypto/fipsmodule/ec/p256.c
@@ -23,6 +23,7 @@
 
 #include "p256_shared.h"
 
+#include "internal.h"
 #include "../../internal.h"
 #include "./util.h"
 
@@ -473,19 +474,17 @@ void p256_point_mul_base(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS]) {
   fiat_p256_to_words(r[2], nq[2]);
 }
 
-#if 0
-
-static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group,
-                                             EC_JACOBIAN *r,
-                                             const EC_SCALAR *g_scalar,
-                                             const EC_JACOBIAN *p,
-                                             const EC_SCALAR *p_scalar) {
+void p256_point_mul_public(Limb r[3][P256_LIMBS],
+                           const Limb g_scalar[P256_LIMBS],
+                           const Limb p_scalar[P256_LIMBS],
+                           const Limb p_x[P256_LIMBS],
+                           const Limb p_y[P256_LIMBS]) {
 #define P256_WSIZE_PUBLIC 4
   // Precompute multiples of |p|. p_pre_comp[i] is (2*i+1) * |p|.
   fiat_p256_felem p_pre_comp[1 << (P256_WSIZE_PUBLIC - 1)][3];
-  fiat_p256_from_generic(p_pre_comp[0][0], &p->X);
-  fiat_p256_from_generic(p_pre_comp[0][1], &p->Y);
-  fiat_p256_from_generic(p_pre_comp[0][2], &p->Z);
+  fiat_p256_from_words(p_pre_comp[0][0], p_x);
+  fiat_p256_from_words(p_pre_comp[0][1], p_y);
+  fiat_p256_copy(p_pre_comp[0][2], fiat_p256_one);
   fiat_p256_felem p2[3];
   fiat_p256_point_double(p2[0], p2[1], p2[2], p_pre_comp[0][0],
                          p_pre_comp[0][1], p_pre_comp[0][2]);
@@ -498,7 +497,7 @@ static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group,
 
   // Set up the coefficients for |p_scalar|.
   int8_t p_wNAF[257];
-  ec_compute_wNAF(group, p_wNAF, p_scalar, 256, P256_WSIZE_PUBLIC);
+  ec_compute_wNAF(p_wNAF, p_scalar, P256_LIMBS, 256, P256_WSIZE_PUBLIC);
 
   // Set |ret| to the point at infinity.
   int skip = 1;  // Save some point operations.
@@ -562,13 +561,11 @@ static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group,
     }
   }
 
-  fiat_p256_to_generic(&r->X, ret[0]);
-  fiat_p256_to_generic(&r->Y, ret[1]);
-  fiat_p256_to_generic(&r->Z, ret[2]);
+  fiat_p256_to_words(r[0], ret[0]);
+  fiat_p256_to_words(r[1], ret[1]);
+  fiat_p256_to_words(r[2], ret[2]);
 }
 
-#endif
-
 void p256_mul_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS],
                    const Limb b[P256_LIMBS]) {
   fiat_p256_felem a_, b_;
diff --git a/crypto/fipsmodule/ec/wnaf.c b/crypto/fipsmodule/ec/wnaf.c
index 56de6cfec5..0c815e783e 100644
--- a/crypto/fipsmodule/ec/wnaf.c
+++ b/crypto/fipsmodule/ec/wnaf.c
@@ -65,17 +65,6 @@
  * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
  * Laboratories. */
 
-#include <openssl/ec.h>
-
-#include <assert.h>
-#include <string.h>
-
-#include <openssl/bn.h>
-#include <openssl/err.h>
-#include <openssl/mem.h>
-#include <openssl/thread.h>
-
-#include "internal.h"
 #include "../bn/internal.h"
 #include "../../internal.h"
 
@@ -85,18 +74,17 @@
 //   http://link.springer.com/chapter/10.1007%2F3-540-45537-X_13
 //   http://www.bmoeller.de/pdf/TI-01-08.multiexp.pdf
 
-void ec_compute_wNAF(const EC_GROUP *group, int8_t *out,
-                     const EC_SCALAR *scalar, size_t bits, int w) {
+void ec_compute_wNAF(int8_t *out, const BN_ULONG scalar[], size_t scalar_limbs, size_t bits, int w) {
   // 'int8_t' can represent integers with absolute values less than 2^7.
-  assert(0 < w && w <= 7);
-  assert(bits != 0);
+  debug_assert_nonsecret(0 < w && w <= 7);
+  debug_assert_nonsecret(bits != 0);
   int bit = 1 << w;         // 2^w, at most 128
   int next_bit = bit << 1;  // 2^(w+1), at most 256
   int mask = next_bit - 1;  // at most 255
 
-  int window_val = scalar->words[0] & mask;
+  int window_val = ((int)scalar[0]) & mask;
   for (size_t j = 0; j < bits + 1; j++) {
-    assert(0 <= window_val && window_val <= next_bit);
+    debug_assert_nonsecret(0 <= window_val && window_val <= next_bit);
     int digit = 0;
     if (window_val & 1) {
       assert(0 < window_val && window_val < next_bit);
@@ -105,7 +93,7 @@ void ec_compute_wNAF(const EC_GROUP *group, int8_t *out,
         // We know -next_bit < digit < 0 and window_val - digit = next_bit.
 
         // modified wNAF
-        if (j + w + 1 >= bits) {
+        if (j + ((size_t)w) + 1 >= bits) {
           // special case for generating modified wNAFs:
           // no new bits will be added into window_val,
           // so using a positive digit here will decrease
@@ -132,17 +120,16 @@ void ec_compute_wNAF(const EC_GROUP *group, int8_t *out,
       assert(digit & 1);
     }
 
-    out[j] = digit;
+    out[j] = (int8_t)digit;
 
     // Incorporate the next bit. Previously, |window_val| <= |next_bit|, so if
     // we shift and add at most one copy of |bit|, this will continue to hold
     // afterwards.
     window_val >>= 1;
-    window_val += bit * bn_is_bit_set_words(scalar->words, group->order.N.width,
-                                            j + w + 1);
-    assert(window_val <= next_bit);
+    window_val += bit * bn_is_bit_set_words(scalar, scalar_limbs, j + (size_t)w + 1);
+    debug_assert_nonsecret(window_val <= next_bit);
   }
 
   // bits + 1 entries should be sufficient to consume all bits.
-  assert(window_val == 0);
+  debug_assert_nonsecret(window_val == 0);
 }
diff --git a/crypto/internal.h b/crypto/internal.h
index 7beb1d44fd..062ca564c6 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -150,6 +150,8 @@ typedef __int128_t int128_t;
 typedef __uint128_t uint128_t;
 #endif
 
+#define OPENSSL_ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))  // Element count of |array|; |array| must be a real array, not a pointer.
+
 // Pointer utility functions.
 
 // buffers_alias returns one if |a| and |b| alias and zero otherwise.
diff --git a/src/ec/suite_b/ops/p256.rs b/src/ec/suite_b/ops/p256.rs
index adbed60936..104c2e10fc 100644
--- a/src/ec/suite_b/ops/p256.rs
+++ b/src/ec/suite_b/ops/p256.rs
@@ -119,9 +119,7 @@ pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps {
     twin_mul: twin_mul_nistz256,
 
     #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
-    twin_mul: |g_scalar, p_scalar, p_xy| {
-        twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy)
-    },
+    twin_mul: twin_mul_fiat,
 
     q_minus_n: Elem::from_hex("4319055358e8617b0c46353d039cdaae"),
 };
@@ -147,6 +145,28 @@ fn point_mul_base_vartime(g_scalar: &Scalar) -> Point {
     scaled_g
 }
 
+#[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))] // `twin_mul` implementation for targets other than aarch64/x86_64.
+fn twin_mul_fiat(g_scalar: &Scalar, p_scalar: &Scalar, &(p_x, p_y): &(Elem<R>, Elem<R>)) -> Point { // Thin FFI wrapper; presumably yields g_scalar*G + p_scalar*P — TODO confirm against the C body.
+    prefixed_extern! { // C routine defined in crypto/fipsmodule/ec/p256.c.
+        fn p256_point_mul_public(r: *mut Limb, // out: declared in C as `Limb r[3][P256_LIMBS]`
+                                g_scalar: *const Limb, // in: declared in C as `const Limb[P256_LIMBS]`
+                                p_scalar: *const Limb, // in: declared in C as `const Limb[P256_LIMBS]`
+                                p_x: *const Limb, // in: x coordinate of the point, `const Limb[P256_LIMBS]`
+                                p_y: *const Limb); // in: y coordinate of the point; the C side sets Z to one itself
+    }
+    let mut r = Point::new_at_infinity(); // Initial value only; the C call stores into r[0], r[1] and r[2].
+    unsafe { // SAFETY(review): assumes each `limbs` buffer holds at least P256_LIMBS limbs and `r.xyz` at least 3 * P256_LIMBS — TODO confirm.
+        p256_point_mul_public(
+            r.xyz.as_mut_ptr(),
+            g_scalar.limbs.as_ptr(),
+            p_scalar.limbs.as_ptr(),
+            p_x.limbs.as_ptr(),
+            p_y.limbs.as_ptr(),
+        );
+    }
+    r
+}
+
 pub static PRIVATE_SCALAR_OPS: PrivateScalarOps = PrivateScalarOps {
     scalar_ops: &SCALAR_OPS,