@@ -319,11 +319,6 @@ static void fe_copy(fe *h, const fe *f) {
 static void fe_copy_lt(fe_loose *h, const fe *f) {
   fe_limbs_copy(h->v, f->v);
 }
-#if !defined(OPENSSL_SMALL)
-static void fe_copy_ll(fe_loose *h, const fe_loose *f) {
-  fe_limbs_copy(h->v, f->v);
-}
-#endif  // !defined(OPENSSL_SMALL)
 
 static void fe_loose_invert(fe *out, const fe_loose *z) {
   fe t0;
@@ -532,12 +527,23 @@ static void ge_p3_0(ge_p3 *h) {
   fe_0(&h->T);
 }
 
+static void ge_cached_0(ge_cached *h) {
+  fe_loose_1(&h->YplusX);
+  fe_loose_1(&h->YminusX);
+  fe_loose_1(&h->Z);
+  fe_loose_0(&h->T2d);
+}
+
+#if defined(OPENSSL_SMALL)
+
 static void ge_precomp_0(ge_precomp *h) {
   fe_loose_1(&h->yplusx);
   fe_loose_1(&h->yminusx);
   fe_loose_0(&h->xy2d);
 }
 
+#endif
+
 // r = p
 static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
   fe_copy(&r->X, &p->X);
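
The values written by ge_cached_0 above are the cached-coordinate form of the neutral element: for (X:Y:Z:T) = (0:1:1:0), the cached tuple (Y+X, Y-X, Z, 2dT) is (1, 1, 1, 0). A minimal sketch of that correspondence, assuming it sits in curve25519.c next to these helpers (check_cached_identity is illustrative only, not part of the patch):

static void check_cached_identity(void) {
  ge_p3 id;
  ge_cached via_p3, direct;
  ge_p3_0(&id);                          // neutral element (0:1:1:0)
  x25519_ge_p3_to_cached(&via_p3, &id);  // (Y+X, Y-X, Z, 2dT) = (1, 1, 1, 0)
  ge_cached_0(&direct);                  // writes the same four values
}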
@@ -568,6 +574,13 @@ static void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
   fe_mul_tll(&r->T, &p->X, &p->Y);
 }
 
+// r = p
+static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) {
+  ge_p3 t;
+  x25519_ge_p1p1_to_p3(&t, p);
+  x25519_ge_p3_to_cached(r, &t);
+}
+
 // r = 2 * p
 static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
   fe trX, trZ, trT;
@@ -664,16 +677,6 @@ static void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
   fe_add(&r->T, &trZ, &trT);
 }
 
-static uint8_t equal(signed char b, signed char c) {
-  uint8_t ub = b;
-  uint8_t uc = c;
-  uint8_t x = ub ^ uc;  // 0: yes; 1..255: no
-  uint32_t y = x;       // 0: yes; 1..255: no
-  y -= 1;               // 4294967295: yes; 0..254: no
-  y >>= 31;             // 1: yes; 0: no
-  return y;
-}
-
 static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
   fe_cmov(&t->yplusx, &u->yplusx, b);
   fe_cmov(&t->yminusx, &u->yminusx, b);
@@ -722,7 +725,7 @@ static void x25519_ge_scalarmult_small_precomp(
     ge_precomp_0(&e);
 
     for (j = 1; j < 16; j++) {
-      cmov(&e, &multiples[j - 1], equal(index, j));
+      cmov(&e, &multiples[j - 1], 1 & constant_time_eq_w(index, j));
    }
 
     ge_cached cached;
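
The deleted equal() returned 1 or 0; constant_time_eq_w returns an all-ones word when its arguments are equal and zero otherwise, so masking with 1 reproduces the old 0/1 selector for cmov. A rough sketch of how such an equality mask can be built without branching on secret data (an illustration only, not BoringSSL's actual implementation in crypto/internal.h; crypto_word_t is the machine-word type declared there):

static crypto_word_t eq_mask_sketch(crypto_word_t a, crypto_word_t b) {
  crypto_word_t x = a ^ b;  // zero iff a == b
  crypto_word_t nonzero =
      (x | ((crypto_word_t)0 - x)) >> (sizeof(x) * 8 - 1);  // 1 iff x != 0
  return nonzero - 1;  // all-ones iff a == b, zero otherwise
}

With this, 1 & eq_mask_sketch(index, j) yields exactly the 0/1 value that cmov expects.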
@@ -742,35 +745,36 @@ void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
 
 #else
 
-static uint8_t negative(signed char b) {
-  uint32_t x = b;
-  x >>= 31;  // 1: yes; 0: no
-  return x;
-}
+static void table_select(ge_precomp *t, const int pos, const signed char b) {
+  uint8_t bnegative = constant_time_msb_w(b);
+  uint8_t babs = b - ((bnegative & b) << 1);
 
-static void table_select(ge_precomp *t, int pos, signed char b) {
-  ge_precomp minust;
-  uint8_t bnegative = negative(b);
-  uint8_t babs = b - ((uint8_t)((-bnegative) & b) << 1);
+  uint8_t t_bytes[3][32] = {
+      {constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}};
+#if defined(__clang__)  // materialize for vectorization, 6% speedup
+  __asm__("" : "+m" (t_bytes) : /*no inputs*/);
+#endif
+  static_assert(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), "");
+  for (int i = 0; i < 8; i++) {
+    constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i],
+                                     sizeof(t_bytes),
+                                     constant_time_eq_w(babs, 1 + i));
+  }
 
-  ge_precomp_0(t);
-  cmov(t, &k25519Precomp[pos][0], equal(babs, 1));
-  cmov(t, &k25519Precomp[pos][1], equal(babs, 2));
-  cmov(t, &k25519Precomp[pos][2], equal(babs, 3));
-  cmov(t, &k25519Precomp[pos][3], equal(babs, 4));
-  cmov(t, &k25519Precomp[pos][4], equal(babs, 5));
-  cmov(t, &k25519Precomp[pos][5], equal(babs, 6));
-  cmov(t, &k25519Precomp[pos][6], equal(babs, 7));
-  cmov(t, &k25519Precomp[pos][7], equal(babs, 8));
-  fe_copy_ll(&minust.yplusx, &t->yminusx);
-  fe_copy_ll(&minust.yminusx, &t->yplusx);
+  fe yplusx, yminusx, xy2d;
+  fe_frombytes_strict(&yplusx, t_bytes[0]);
+  fe_frombytes_strict(&yminusx, t_bytes[1]);
+  fe_frombytes_strict(&xy2d, t_bytes[2]);
 
-  // NOTE: the input table is canonical, but types don't encode it
-  fe tmp;
-  fe_carry(&tmp, &t->xy2d);
-  fe_neg(&minust.xy2d, &tmp);
+  fe_copy_lt(&t->yplusx, &yplusx);
+  fe_copy_lt(&t->yminusx, &yminusx);
+  fe_copy_lt(&t->xy2d, &xy2d);
 
-  cmov(t, &minust, bnegative);
+  ge_precomp minust;
+  fe_copy_lt(&minust.yplusx, &yminusx);
+  fe_copy_lt(&minust.yminusx, &yplusx);
+  fe_neg(&minust.xy2d, &xy2d);
+  cmov(t, &minust, bnegative >> 7);
 }
 
 // h = a * B
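
The rewritten table_select selects by XOR rather than by repeated cmov: t_bytes starts as the byte encoding of the identity entry (1, 1, 0) when b == 0 and as all zeros otherwise, and at most one constant_time_eq_w(babs, 1 + i) mask is all-ones, so the loop XORs in at most one precomputed entry. A sketch of the conditional-memxor primitive it relies on (illustrative only; the real constant_time_conditional_memxor lives in crypto/internal.h and takes a word-sized mask):

// XORs |src| into |dst| when |mask| is all-ones; leaves |dst| unchanged when
// |mask| is zero. The memory access pattern is identical in both cases.
static void conditional_memxor_sketch(uint8_t *dst, const uint8_t *src,
                                      size_t n, uint8_t mask) {
  for (size_t i = 0; i < n; i++) {
    dst[i] ^= src[i] & mask;
  }
}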
@@ -829,6 +833,67 @@ void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
 
 #endif
 
+static void cmov_cached(ge_cached *t, ge_cached *u, uint8_t b) {
+  fe_cmov(&t->YplusX, &u->YplusX, b);
+  fe_cmov(&t->YminusX, &u->YminusX, b);
+  fe_cmov(&t->Z, &u->Z, b);
+  fe_cmov(&t->T2d, &u->T2d, b);
+}
+
+// r = scalar * A.
+// where a = a[0]+256*a[1]+...+256^31 a[31].
+void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) {
+  ge_p2 Ai_p2[8];
+  ge_cached Ai[16];
+  ge_p1p1 t;
+
+  ge_cached_0(&Ai[0]);
+  x25519_ge_p3_to_cached(&Ai[1], A);
+  ge_p3_to_p2(&Ai_p2[1], A);
+
+  unsigned i;
+  for (i = 2; i < 16; i += 2) {
+    ge_p2_dbl(&t, &Ai_p2[i / 2]);
+    ge_p1p1_to_cached(&Ai[i], &t);
+    if (i < 8) {
+      x25519_ge_p1p1_to_p2(&Ai_p2[i], &t);
+    }
+    x25519_ge_add(&t, A, &Ai[i]);
+    ge_p1p1_to_cached(&Ai[i + 1], &t);
+    if (i < 7) {
+      x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t);
+    }
+  }
+
+  ge_p2_0(r);
+  ge_p3 u;
+
+  for (i = 0; i < 256; i += 4) {
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p2(r, &t);
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p2(r, &t);
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p2(r, &t);
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p3(&u, &t);
+
+    uint8_t index = scalar[31 - i / 8];
+    index >>= 4 - (i & 4);
+    index &= 0xf;
+
+    unsigned j;
+    ge_cached selected;
+    ge_cached_0(&selected);
+    for (j = 0; j < 16; j++) {
+      cmov_cached(&selected, &Ai[j], 1 & constant_time_eq_w(index, j));
+    }
+
+    x25519_ge_add(&t, &u, &selected);
+    x25519_ge_p1p1_to_p2(r, &t);
+  }
+}
+
 static void slide(signed char *r, const uint8_t *a) {
   int i;
   int b;
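
In the main loop of x25519_ge_scalarmult above, each iteration performs four doublings and then adds Ai[index], where index is the next 4-bit window of the scalar, scanned from the most significant nibble of scalar[31] downwards. A standalone illustration of that nibble scan (scalar_nibble and the main routine are hypothetical, not part of the patch):

#include <stdint.h>
#include <stdio.h>

// Returns the 4-bit window at bit offset |i| (0, 4, 8, ..., 252), walking the
// scalar from its most significant nibble down, as x25519_ge_scalarmult does.
static uint8_t scalar_nibble(const uint8_t scalar[32], unsigned i) {
  uint8_t index = scalar[31 - i / 8];  // two windows per byte, top byte first
  index >>= 4 - (i & 4);               // high nibble first, then low nibble
  return index & 0xf;
}

int main(void) {
  uint8_t s[32] = {0};
  s[31] = 0xab;  // most significant scalar byte
  printf("%x %x\n", scalar_nibble(s, 0), scalar_nibble(s, 4));  // prints: a b
  return 0;
}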