@@ -319,11 +319,6 @@ static void fe_copy(fe *h, const fe *f) {
 static void fe_copy_lt(fe_loose *h, const fe *f) {
   fe_limbs_copy(h->v, f->v);
 }
-#if !defined(OPENSSL_SMALL)
-static void fe_copy_ll(fe_loose *h, const fe_loose *f) {
-  fe_limbs_copy(h->v, f->v);
-}
-#endif  // !defined(OPENSSL_SMALL)
 
 static void fe_loose_invert(fe *out, const fe_loose *z) {
   fe t0;
@@ -532,12 +527,23 @@ static void ge_p3_0(ge_p3 *h) {
   fe_0(&h->T);
 }
 
+static void ge_cached_0(ge_cached *h) {
+  fe_loose_1(&h->YplusX);
+  fe_loose_1(&h->YminusX);
+  fe_loose_1(&h->Z);
+  fe_loose_0(&h->T2d);
+}
+
+#if defined(OPENSSL_SMALL)
+
 static void ge_precomp_0(ge_precomp *h) {
   fe_loose_1(&h->yplusx);
   fe_loose_1(&h->yminusx);
   fe_loose_0(&h->xy2d);
 }
 
+#endif
+
 // r = p
 static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
   fe_copy(&r->X, &p->X);
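
The values written by ge_cached_0 above are the cached-coordinate form of the neutral element: for (X:Y:Z:T) = (0:1:1:0), the cached tuple (Y+X, Y-X, Z, 2dT) is (1, 1, 1, 0). A minimal sketch of that correspondence, assuming it sits in curve25519.c next to these helpers (check_cached_identity is illustrative only, not part of the patch):

static void check_cached_identity(void) {
  ge_p3 id;
  ge_cached via_p3, direct;
  ge_p3_0(&id);                          // neutral element (0:1:1:0)
  x25519_ge_p3_to_cached(&via_p3, &id);  // (Y+X, Y-X, Z, 2dT) = (1, 1, 1, 0)
  ge_cached_0(&direct);                  // writes the same four values
}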
@@ -568,6 +574,13 @@ static void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
   fe_mul_tll(&r->T, &p->X, &p->Y);
 }
 
+// r = p
+static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) {
+  ge_p3 t;
+  x25519_ge_p1p1_to_p3(&t, p);
+  x25519_ge_p3_to_cached(r, &t);
+}
+
 // r = 2 * p
 static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
   fe trX, trZ, trT;
@@ -664,16 +677,6 @@ static void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
   fe_add(&r->T, &trZ, &trT);
 }
 
-static uint8_t equal(signed char b, signed char c) {
-  uint8_t ub = b;
-  uint8_t uc = c;
-  uint8_t x = ub ^ uc;  // 0: yes; 1..255: no
-  uint32_t y = x;       // 0: yes; 1..255: no
-  y -= 1;               // 4294967295: yes; 0..254: no
-  y >>= 31;             // 1: yes; 0: no
-  return y;
-}
-
 static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
   fe_cmov(&t->yplusx, &u->yplusx, b);
   fe_cmov(&t->yminusx, &u->yminusx, b);
@@ -722,7 +725,7 @@ static void x25519_ge_scalarmult_small_precomp(
     ge_precomp_0(&e);
 
     for (j = 1; j < 16; j++) {
-      cmov(&e, &multiples[j - 1], equal(index, j));
+      cmov(&e, &multiples[j - 1], 1 & constant_time_eq_w(index, j));
    }
 
     ge_cached cached;
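
The deleted equal() returned 1 or 0; constant_time_eq_w returns an all-ones word when its arguments are equal and zero otherwise, so masking with 1 reproduces the old 0/1 selector for cmov. A rough sketch of how such an equality mask can be built without branching on secret data (an illustration only, not BoringSSL's actual implementation in crypto/internal.h; crypto_word_t is the machine-word type declared there):

static crypto_word_t eq_mask_sketch(crypto_word_t a, crypto_word_t b) {
  crypto_word_t x = a ^ b;  // zero iff a == b
  crypto_word_t nonzero =
      (x | ((crypto_word_t)0 - x)) >> (sizeof(x) * 8 - 1);  // 1 iff x != 0
  return nonzero - 1;  // all-ones iff a == b, zero otherwise
}

With this, 1 & eq_mask_sketch(index, j) yields exactly the 0/1 value that cmov expects.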
@@ -742,35 +745,36 @@ void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
 
 #else
 
-static uint8_t negative(signed char b) {
-  uint32_t x = b;
-  x >>= 31;  // 1: yes; 0: no
-  return x;
-}
+static void table_select(ge_precomp *t, const int pos, const signed char b) {
+  uint8_t bnegative = constant_time_msb_w(b);
+  uint8_t babs = b - ((bnegative & b) << 1);
 
-static void table_select(ge_precomp *t, int pos, signed char b) {
-  ge_precomp minust;
-  uint8_t bnegative = negative(b);
-  uint8_t babs = b - ((uint8_t)((-bnegative) & b) << 1);
+  uint8_t t_bytes[3][32] = {
+      {constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}};
+#if defined(__clang__)  // materialize for vectorization, 6% speedup
+  __asm__("" : "+m" (t_bytes) : /*no inputs*/);
+#endif
+  static_assert(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), "");
+  for (int i = 0; i < 8; i++) {
+    constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i],
+                                     sizeof(t_bytes),
+                                     constant_time_eq_w(babs, 1 + i));
+  }
 
-  ge_precomp_0(t);
-  cmov(t, &k25519Precomp[pos][0], equal(babs, 1));
-  cmov(t, &k25519Precomp[pos][1], equal(babs, 2));
-  cmov(t, &k25519Precomp[pos][2], equal(babs, 3));
-  cmov(t, &k25519Precomp[pos][3], equal(babs, 4));
-  cmov(t, &k25519Precomp[pos][4], equal(babs, 5));
-  cmov(t, &k25519Precomp[pos][5], equal(babs, 6));
-  cmov(t, &k25519Precomp[pos][6], equal(babs, 7));
-  cmov(t, &k25519Precomp[pos][7], equal(babs, 8));
-  fe_copy_ll(&minust.yplusx, &t->yminusx);
-  fe_copy_ll(&minust.yminusx, &t->yplusx);
+  fe yplusx, yminusx, xy2d;
+  fe_frombytes_strict(&yplusx, t_bytes[0]);
+  fe_frombytes_strict(&yminusx, t_bytes[1]);
+  fe_frombytes_strict(&xy2d, t_bytes[2]);
 
-  // NOTE: the input table is canonical, but types don't encode it
-  fe tmp;
-  fe_carry(&tmp, &t->xy2d);
-  fe_neg(&minust.xy2d, &tmp);
+  fe_copy_lt(&t->yplusx, &yplusx);
+  fe_copy_lt(&t->yminusx, &yminusx);
+  fe_copy_lt(&t->xy2d, &xy2d);
 
-  cmov(t, &minust, bnegative);
+  ge_precomp minust;
+  fe_copy_lt(&minust.yplusx, &yminusx);
+  fe_copy_lt(&minust.yminusx, &yplusx);
+  fe_neg(&minust.xy2d, &xy2d);
+  cmov(t, &minust, bnegative >> 7);
 }
 
 // h = a * B
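
The rewritten table_select selects by XOR rather than by repeated cmov: t_bytes starts as the byte encoding of the identity entry (1, 1, 0) when b == 0 and as all zeros otherwise, and at most one constant_time_eq_w(babs, 1 + i) mask is all-ones, so the loop XORs in at most one precomputed entry. A sketch of the conditional-memxor primitive it relies on (illustrative only; the real constant_time_conditional_memxor lives in crypto/internal.h and takes a word-sized mask):

// XORs |src| into |dst| when |mask| is all-ones; leaves |dst| unchanged when
// |mask| is zero. The memory access pattern is identical in both cases.
static void conditional_memxor_sketch(uint8_t *dst, const uint8_t *src,
                                      size_t n, uint8_t mask) {
  for (size_t i = 0; i < n; i++) {
    dst[i] ^= src[i] & mask;
  }
}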
@@ -829,6 +833,67 @@ void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
 
 #endif
 
+static void cmov_cached(ge_cached *t, ge_cached *u, uint8_t b) {
+  fe_cmov(&t->YplusX, &u->YplusX, b);
+  fe_cmov(&t->YminusX, &u->YminusX, b);
+  fe_cmov(&t->Z, &u->Z, b);
+  fe_cmov(&t->T2d, &u->T2d, b);
+}
+
+// r = scalar * A.
+// where a = a[0]+256*a[1]+...+256^31 a[31].
+void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) {
+  ge_p2 Ai_p2[8];
+  ge_cached Ai[16];
+  ge_p1p1 t;
+
+  ge_cached_0(&Ai[0]);
+  x25519_ge_p3_to_cached(&Ai[1], A);
+  ge_p3_to_p2(&Ai_p2[1], A);
+
+  unsigned i;
+  for (i = 2; i < 16; i += 2) {
+    ge_p2_dbl(&t, &Ai_p2[i / 2]);
+    ge_p1p1_to_cached(&Ai[i], &t);
+    if (i < 8) {
+      x25519_ge_p1p1_to_p2(&Ai_p2[i], &t);
+    }
+    x25519_ge_add(&t, A, &Ai[i]);
+    ge_p1p1_to_cached(&Ai[i + 1], &t);
+    if (i < 7) {
+      x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t);
+    }
+  }
+
+  ge_p2_0(r);
+  ge_p3 u;
+
+  for (i = 0; i < 256; i += 4) {
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p2(r, &t);
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p2(r, &t);
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p2(r, &t);
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p3(&u, &t);
+
+    uint8_t index = scalar[31 - i / 8];
+    index >>= 4 - (i & 4);
+    index &= 0xf;
+
+    unsigned j;
+    ge_cached selected;
+    ge_cached_0(&selected);
+    for (j = 0; j < 16; j++) {
+      cmov_cached(&selected, &Ai[j], 1 & constant_time_eq_w(index, j));
+    }
+
+    x25519_ge_add(&t, &u, &selected);
+    x25519_ge_p1p1_to_p2(r, &t);
+  }
+}
+
 static void slide(signed char *r, const uint8_t *a) {
   int i;
   int b;
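
In the main loop of x25519_ge_scalarmult above, each iteration performs four doublings and then adds Ai[index], where index is the next 4-bit window of the scalar, scanned from the most significant nibble of scalar[31] downwards. A standalone illustration of that nibble scan (scalar_nibble and the main routine are hypothetical, not part of the patch):

#include <stdint.h>
#include <stdio.h>

// Returns the 4-bit window at bit offset |i| (0, 4, 8, ..., 252), walking the
// scalar from its most significant nibble down, as x25519_ge_scalarmult does.
static uint8_t scalar_nibble(const uint8_t scalar[32], unsigned i) {
  uint8_t index = scalar[31 - i / 8];  // two windows per byte, top byte first
  index >>= 4 - (i & 4);               // high nibble first, then low nibble
  return index & 0xf;
}

int main(void) {
  uint8_t s[32] = {0};
  s[31] = 0xab;  // most significant scalar byte
  printf("%x %x\n", scalar_nibble(s, 0), scalar_nibble(s, 4));  // prints: a b
  return 0;
}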