-
Notifications
You must be signed in to change notification settings - Fork 0
/
mulmodmont384.x86_64.S
231 lines (196 loc) · 8.2 KB
/
mulmodmont384.x86_64.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
.globl mulmodmont384
# This is part of the repository: https://github.com/poemm/assembles
# Notes:
# This is a handwritten x86_64 assembly implementation of mulmodmont384
# It is over-abstracted.
# - There could be fewer macros. But the goal is to reuse these macros elsewhere.
# - All registers are parameterized to make it easier to translate to other architecture's assembly.
# Returns x*y*R (modulo mod), where R=2**384, the standard montgomery multiplication.
# Function signature is: void mulmodmont384(uint64_t outptr[6], uint64_t x[6], uint64_t y[6], uint64_t mod[6], uint64_t inv)
# The algorithm is Coarsely Integrated Operand Scanning (CIOS), as described based on in Çetin K. Koç; Tolga Acar; Burton S. Kaliski, Jr. (June 1996). "Analyzing and Comparing Montgomery Multiplication Algorithms". IEEE Micro. 16 (3): 26–33.
.macro second_inner_loop j Aj_1 Aj carry hi lo A0inv mptr
mov \j*8(\mptr), \lo # get y[j] in register for multiply 64x64->128
mul \A0inv # hi,lo = m[j]*A0inv
add \Aj, \lo # add A[j] to low limb, overflow flag may be flipped
adc $0, \hi # add overflow flag to high limb of x[i]*y[j]
add \carry, \lo # add carry to low limb of x[i]*y[j]+A[j]
adc $0, \hi # add overflow flag to high limb
mov \hi, \carry # save high limb to carry limb for next iter
mov \lo, \Aj_1 # save low limb to A[j-1]
.endm
.macro first_inner_loop j Aj carry hi lo xi yptr
mov \j*8(\yptr), \lo # get y[j] in register for multiply 64x64->128
mul \xi # hi,lo = y[j]*x[i]
add \Aj, \lo # add A[j] to low limb, overflow flag may be flipped
adc $0, \hi # add overflow flag to high limb of x[i]*y[j]
add \carry, \lo # add carry to low limb of x[i]*y[j]+A[j]
adc $0, \hi # add overflow flag to high limb
mov \hi, \carry # save high limb to carry limb for next iter
mov \lo, \Aj # save low limb to A[j]
.endm
.macro main_loop i carry hi lo in_ptrs tmp_ptr tmp_val A0 A1 A2 A3 A4 A5 A6 A7
# prepare for first inner loop:
xor \carry, \carry # zero carry limb
mov 0(\in_ptrs), \tmp_ptr # get xptr
mov \i*8(\tmp_ptr), \tmp_val # get x[i]
mov 8(\in_ptrs), \tmp_ptr # get yptr
# first inner loop iterations for j=0,...,5
# arguments are j Aj carry hi lo xi yptr
first_inner_loop 0 \A0 \carry \hi \lo \tmp_val \tmp_ptr
first_inner_loop 1 \A1 \carry \hi \lo \tmp_val \tmp_ptr
first_inner_loop 2 \A2 \carry \hi \lo \tmp_val \tmp_ptr
first_inner_loop 3 \A3 \carry \hi \lo \tmp_val \tmp_ptr
first_inner_loop 4 \A4 \carry \hi \lo \tmp_val \tmp_ptr
first_inner_loop 5 \A5 \carry \hi \lo \tmp_val \tmp_ptr
# finish carrying from first inner loop
xor \A7, \A7 # zero A[7], will take overflow in the following addition bit
add \carry, \A6 # A[6] += carry limb
adc $0, \A7 # A[7] = overflow flag
# prepare for second inner loop
mov 24(\in_ptrs), \tmp_val # get inv
imul \A0, \tmp_val # A0inv = A[0]*inv, overflow can be ignored
mov 16(\in_ptrs), \tmp_ptr # get mptr
# first iter of second inner loop can avoid carry
mov 0(\tmp_ptr), \lo # get m[0]
mul \tmp_val # hi,lo = A0inv*m[0], I think lo is zero no matter what
add \A0, \lo # lo += A0
adc $0, \hi # hi += overflow form previous addition
mov \hi, \carry # carry limb = overflow of A0inv*m[0]
# rest of iters
# arguments are j A[j-1] A[j] carry hi lo A0inv mptr
second_inner_loop 1 \A0 \A1 \carry \hi \lo \tmp_val \tmp_ptr
second_inner_loop 2 \A1 \A2 \carry \hi \lo \tmp_val \tmp_ptr
second_inner_loop 3 \A2 \A3 \carry \hi \lo \tmp_val \tmp_ptr
second_inner_loop 4 \A3 \A4 \carry \hi \lo \tmp_val \tmp_ptr
second_inner_loop 5 \A4 \A5 \carry \hi \lo \tmp_val \tmp_ptr
# finish carrying from second inner loop
add \A6, \carry # prepare for A[5] = A[6]+carry
adc $0, \A7 # add overflow bit to A[7]
mov \carry, \A5 # finish A[5] = A[6]+carry
mov \A7, \A6 # A[6] = A[7]
.endm
.macro mulmodmont384_core a0 a1 a2 a3 a4 a5 r0 r1 r2 r3 r4 r5 r6 r7
# free argument registers by dumping these to output array
mov \a1, 0(\a0) # xptr
mov \a2, 8(\a0) # yptr
mov \a3, 16(\a0) # mptr
mov \a4, 24(\a0) # inv
# init array A to zero
xor \a5, \a5 # A[0] = 0
xor \r1, \r1 # A[1] = 0
xor \r2, \r2 # A[2] = 0
xor \r3, \r3 # A[3] = 0
xor \r4, \r4 # A[4] = 0
xor \r5, \r5 # A[5] = 0
xor \r6, \r6 # A[6] = 0
xor \r7, \r7 # A[7] = 0
# five main loop iterations
# params: i carry hi lo in_ptrs tmp_ptr tmp_val A0 A1 A2 A3 A4 A5 A6 A7
main_loop 0 \a3 \a2 \r0 \a0 \a1 \a4 \a5 \r1 \r2 \r3 \r4 \r5 \r6 \r7
main_loop 1 \a3 \a2 \r0 \a0 \a1 \a4 \a5 \r1 \r2 \r3 \r4 \r5 \r6 \r7
main_loop 2 \a3 \a2 \r0 \a0 \a1 \a4 \a5 \r1 \r2 \r3 \r4 \r5 \r6 \r7
main_loop 3 \a3 \a2 \r0 \a0 \a1 \a4 \a5 \r1 \r2 \r3 \r4 \r5 \r6 \r7
main_loop 4 \a3 \a2 \r0 \a0 \a1 \a4 \a5 \r1 \r2 \r3 \r4 \r5 \r6 \r7
main_loop 5 \a3 \a2 \r0 \a0 \a1 \a4 \a5 \r1 \r2 \r3 \r4 \r5 \r6 \r7
.endm
.macro subtract384_if_leq mptr x0 x1 x2 x3 x4 x5 x6 y0 y1 y2 y3 y4 y5
# check if x<=y, if so subtract x=y-x, otherwise don't subtract
# inputs:
# mptr = pointer to modulus uint64_t m[6]
# x0,x1,x2,x3,x4,x5,r6 = registers containing x, including overflow limb r6
# y0,y1,y2,y3,y4,y5 = registers to bring values at mptr
# move mod into registers y0,y1,...,y5
mov 0(\mptr), \y0 # y0 = mod[0]
mov 8(\mptr), \y1 # y1 = mod[1]
mov 16(\mptr), \y2 # y2 = mod[2]
mov 24(\mptr), \y3 # y3 = mod[3]
mov 32(\mptr), \y4 # y4 = mod[4]
mov 40(\mptr), \y5 # y5 = mod[5]
# special case: overflow limb nonzero, then must subtract
cmp $0, \x6 # check if overflow limb nonzero
jb subtraction
# compare limbs whether x[i]<y[i] or x[i]>y[i] for i=0,1,...,5.
# If x[i]==y[i], then check next limb. If x==y, fall through to subtraction
cmp \x5, \y5 # compare x5 and y5
ja skip_subtraction # x5>y5, so skip subtraction
jb subtraction # from cmp above, x5<x5, so need subtraction
# note: niether x5<y5 nor x5>y5, so x5==y5, so check lower limbs
cmp \x4, \y4 # compare x4 and y4
ja skip_subtraction #
jb subtraction #
cmp \x3, \y3 # compare x3 and y3
ja subtraction #
jb skip_subtraction #
cmp \x2, \y2 # compare x2 and y2
ja subtraction #
jb skip_subtraction #
cmp \x1, \y1 # compare x1 and y1
ja subtraction #
jb skip_subtraction #
cmp \x0, \y0 # compare x0 and y0
ja subtraction #
jb skip_subtraction #
# reaching here means x==y, so fall through to subtraction
subtraction:
# x = x-y
sub \y0, \x0
sbb \y1, \x1
sbb \y2, \x2
sbb \y3, \x3
sbb \y4, \x4
sbb \y5, \x5
skip_subtraction: # do nothing
.endm
.macro mulmodmont384_full a0 a1 a2 a3 a4 a5 r0 r1 r2 r3 r4 r5 r6 r7
# input args
# a0 = outptr
# a1 = xptr
# a2 = yptr
# a3 = mptr
# a4 = inv
# core part
mulmodmont384_core \a0 \a1 \a2 \a3 \a4 \a5 \r0 \r1 \r2 \r3 \r4 \r5 \r6 \r7
# get modulus pointer in preparation final subtraction
mov 16(\a0), \r0 # r0 = mptr
# final subtraction
# parameters are mptr registers with out registers to store mod
subtract384_if_leq \r0 \a5 \r1 \r2 \r3 \r4 \r5 \r6 \a1 \a2 \a3 \a4 \r6 \r7
# write result to out
movq \a5, 0(\a0) # write result for this 64-bit limb
movq \r1, 8(\a0) # write result for this 64-bit limb
movq \r2, 16(\a0) # write result for this 64-bit limb
movq \r3, 24(\a0) # write result for this 64-bit limb
movq \r4, 32(\a0) # write result for this 64-bit limb
movq \r5, 40(\a0) # write result for this 64-bit limb
.endm
.text
mulmodmont384:
# following C calling conventions, save registers
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
# Don't need to save stack ptr since we don't use it
# note: uncomment if stack pointer is needed to handle interrupts
#movq %rsp, stack_pointer_saved_in_data_section
# input args:
# rdi = outptr
# rsi = xptr
# rdx = yptr
# rcx = mptr
# r8 = inv
# parameters are a0 a1 a2 a3 a4 a5 r0 r1 r2 r3 r4 r5 r6 r7
mulmodmont384_full %rdi %rsi %rdx %rcx %r8 %r9 %rax %rbx %r10 %r11 %r12 %r13 %r14 %r15
# Don't need to recover the stack pointer
#movq stack_pointer_saved_in_data_section, %rsp
# as callee, return saved registers to original
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
# finally, return
ret