Skip to content

Commit a3549c8

Browse files
authored
Merge pull request #17 from dcu/tmp-fix-exp
Fix for Exp operation in arm64
2 parents db8bbc4 + 3d88623 commit a3549c8

File tree

5 files changed

+156
-153
lines changed

5 files changed

+156
-153
lines changed

all_test.go

+20-20
Original file line numberDiff line numberDiff line change
@@ -2439,26 +2439,26 @@ func TestSinh(t *testing.T) {
24392439
}
24402440
}
24412441

2442-
// func TestSqrt(t *testing.T) {
2443-
// for i := 0; i < len(vf); i++ {
2444-
// a := Abs(vf[i])
2445-
// if f := SqrtGo(a); sqrt[i] != f {
2446-
// t.Errorf("SqrtGo(%g) = %g, want %g", a, f, sqrt[i])
2447-
// }
2448-
// a = Abs(vf[i])
2449-
// if f := Sqrt(a); sqrt[i] != f {
2450-
// t.Errorf("Sqrt(%g) = %g, want %g", a, f, sqrt[i])
2451-
// }
2452-
// }
2453-
// for i := 0; i < len(vfsqrtSC); i++ {
2454-
// if f := SqrtGo(vfsqrtSC[i]); !alike(sqrtSC[i], f) {
2455-
// t.Errorf("SqrtGo(%g) = %g, want %g", vfsqrtSC[i], f, sqrtSC[i])
2456-
// }
2457-
// if f := Sqrt(vfsqrtSC[i]); !alike(sqrtSC[i], f) {
2458-
// t.Errorf("Sqrt(%g) = %g, want %g", vfsqrtSC[i], f, sqrtSC[i])
2459-
// }
2460-
// }
2461-
// }
2442+
func TestSqrt(t *testing.T) {
2443+
for i := 0; i < len(vf); i++ {
2444+
a := Abs(vf[i])
2445+
if f := SqrtGo(a); sqrt[i] != f {
2446+
t.Errorf("SqrtGo(%g) = %g, want %g", a, f, sqrt[i])
2447+
}
2448+
a = Abs(vf[i])
2449+
if f := Sqrt(a); sqrt[i] != f {
2450+
t.Errorf("Sqrt(%g) = %g, want %g", a, f, sqrt[i])
2451+
}
2452+
}
2453+
for i := 0; i < len(vfsqrtSC); i++ {
2454+
if f := SqrtGo(vfsqrtSC[i]); !alike(sqrtSC[i], f) {
2455+
t.Errorf("SqrtGo(%g) = %g, want %g", vfsqrtSC[i], f, sqrtSC[i])
2456+
}
2457+
if f := Sqrt(vfsqrtSC[i]); !alike(sqrtSC[i], f) {
2458+
t.Errorf("Sqrt(%g) = %g, want %g", vfsqrtSC[i], f, sqrtSC[i])
2459+
}
2460+
}
2461+
}
24622462

24632463
func TestTan(t *testing.T) {
24642464
for i := 0; i < len(vf); i++ {

exp_amd64.s

+3
Original file line numberDiff line numberDiff line change
@@ -108,3 +108,6 @@ overflow: // return +Inf
108108
notNegInf: // NaN or +Inf, return x
109109
MOVL BX, ret+8(FP)
110110
RET
111+
112+
TEXT ·Exp2(SB),NOSPLIT,$0
113+
JMP ·exp2(SB)

exp_arm64.s

+129-129
Original file line numberDiff line numberDiff line change
@@ -2,181 +2,181 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
#define Ln2Hi 6.93147180369123816490e-01
6-
#define Ln2Lo 1.90821492927058770002e-10
7-
#define Log2e 1.44269504088896338700e+00
8-
#define Overflow 7.09782712893383973096e+02
9-
#define Underflow -7.45133219101941108420e+02
10-
#define Overflow2 1.0239999999999999e+03
5+
#define Ln2Hi 6.9313812256e-01
6+
#define Ln2Lo 9.0580006145e-06
7+
#define Log2e 1.4426950216e+00
8+
#define Overflow 7.097827e+02
9+
#define Underflow -7.451332e+02
10+
#define Overflow2 1.024000e+03
1111
#define Underflow2 -1.0740e+03
12-
#define NearZero 0x3e30000000000000 // 2**-28
13-
#define PosInf 0x7ff0000000000000
14-
#define FracMask 0x000fffffffffffff
15-
#define C1 0x3cb0000000000000 // 2**-52
16-
#define P1 1.66666666666666657415e-01 // 0x3FC55555; 0x55555555
17-
#define P2 -2.77777777770155933842e-03 // 0xBF66C16C; 0x16BEBD93
18-
#define P3 6.61375632143793436117e-05 // 0x3F11566A; 0xAF25DE2C
19-
#define P4 -1.65339022054652515390e-06 // 0xBEBBBD41; 0xC5D26BF1
20-
#define P5 4.13813679705723846039e-08 // 0x3E663769; 0x72BEA4D0
12+
#define NearZero 0x317fffff // largest float32 below 2**-28 (2**-28 itself is 0x31800000)
13+
#define PosInf 0x7f800000
14+
#define FracMask 0x07fffff
15+
#define C1 0x34000000 // 2**-23
16+
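#define P1 1.6666667163e-01 // float32 approximation of 1.66666666666666657415e-01 (float64 bits 0x3FC55555; 0x55555555)
17+
#define P2 -2.7777778450e-03 // float32 approximation of -2.77777777770155933842e-03 (float64 bits 0xBF66C16C; 0x16BEBD93)
18+
#define P3 6.6137559770e-05 // float32 approximation of 6.61375632143793436117e-05 (float64 bits 0x3F11566A; 0xAF25DE2C)
19+
#define P4 -1.6533901999e-06 // float32 approximation of -1.65339022054652515390e-06 (float64 bits 0xBEBBBD41; 0xC5D26BF1)
20+
#define P5 4.1381369442e-08 // float32 approximation of 4.13813679705723846039e-08 (float64 bits 0x3E663769; 0x72BEA4D0)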
2121

2222
// Exp returns e**x, the base-e exponential of x.
2323
// This is an assembly implementation of the method used for function Exp in file exp.go.
2424
//
25-
// func Exp(x float64) float64
25+
// func Exp(x float32) float32
2626
TEXT ·Exp(SB),$0-16
27-
FMOVD x+0(FP), F0 // F0 = x
28-
FCMPD F0, F0
27+
FMOVS x+0(FP), F0 // F0 = x
28+
FCMPS F0, F0
2929
BNE isNaN // x = NaN, return NaN
30-
FMOVD $Overflow, F1
31-
FCMPD F1, F0
30+
FMOVS $Overflow, F1
31+
FCMPS F1, F0
3232
BGT overflow // x > Overflow, return PosInf
33-
FMOVD $Underflow, F1
34-
FCMPD F1, F0
33+
FMOVS $Underflow, F1
34+
FCMPS F1, F0
3535
BLT underflow // x < Underflow, return 0
36-
MOVD $NearZero, R0
37-
FMOVD R0, F2
38-
FABSD F0, F3
39-
FMOVD $1.0, F1 // F1 = 1.0
40-
FCMPD F2, F3
36+
MOVW $NearZero, R0
37+
FMOVS R0, F2
38+
FABSS F0, F3
39+
FMOVS $1.0, F1 // F1 = 1.0
40+
FCMPS F2, F3
4141
BLT nearzero // fabs(x) < NearZero, return 1 + x
4242
// argument reduction, x = k*ln2 + r, |r| <= 0.5*ln2
4343
// computed as r = hi - lo for extra precision.
44-
FMOVD $Log2e, F2
45-
FMOVD $0.5, F3
46-
FNMSUBD F0, F3, F2, F4 // Log2e*x - 0.5
47-
FMADDD F0, F3, F2, F3 // Log2e*x + 0.5
48-
FCMPD $0.0, F0
49-
FCSELD LT, F4, F3, F3 // F3 = k
50-
FCVTZSD F3, R1 // R1 = int(k)
51-
SCVTFD R1, F3 // F3 = float64(int(k))
52-
FMOVD $Ln2Hi, F4 // F4 = Ln2Hi
53-
FMOVD $Ln2Lo, F5 // F5 = Ln2Lo
54-
FMSUBD F3, F0, F4, F4 // F4 = hi = x - float64(int(k))*Ln2Hi
55-
FMULD F3, F5 // F5 = lo = float64(int(k)) * Ln2Lo
56-
FSUBD F5, F4, F6 // F6 = r = hi - lo
57-
FMULD F6, F6, F7 // F7 = t = r * r
44+
FMOVS $Log2e, F2
45+
FMOVS $0.5, F3
46+
FNMSUBS F0, F3, F2, F4 // Log2e*x - 0.5
47+
FMADDS F0, F3, F2, F3 // Log2e*x + 0.5
48+
FCMPS $0.0, F0
49+
FCSELS LT, F4, F3, F3 // F3 = k
50+
FCVTZSS F3, R1 // R1 = int(k)
51+
SCVTFS R1, F3 // F3 = float32(int(k))
52+
FMOVS $Ln2Hi, F4 // F4 = Ln2Hi
53+
FMOVS $Ln2Lo, F5 // F5 = Ln2Lo
54+
FMSUBS F3, F0, F4, F4 // F4 = hi = x - float32(int(k))*Ln2Hi
55+
FMULS F3, F5 // F5 = lo = float32(int(k)) * Ln2Lo
56+
FSUBS F5, F4, F6 // F6 = r = hi - lo
57+
FMULS F6, F6, F7 // F7 = t = r * r
5858
// compute y
59-
FMOVD $P5, F8 // F8 = P5
60-
FMOVD $P4, F9 // F9 = P4
61-
FMADDD F7, F9, F8, F13 // P4+t*P5
62-
FMOVD $P3, F10 // F10 = P3
63-
FMADDD F7, F10, F13, F13 // P3+t*(P4+t*P5)
64-
FMOVD $P2, F11 // F11 = P2
65-
FMADDD F7, F11, F13, F13 // P2+t*(P3+t*(P4+t*P5))
66-
FMOVD $P1, F12 // F12 = P1
67-
FMADDD F7, F12, F13, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5)))
68-
FMSUBD F7, F6, F13, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))))
69-
FMOVD $2.0, F14
70-
FSUBD F13, F14
71-
FMULD F6, F13, F15
72-
FDIVD F14, F15 // F15 = (r*c)/(2-c)
73-
FSUBD F15, F5, F15 // lo-(r*c)/(2-c)
74-
FSUBD F4, F15, F15 // (lo-(r*c)/(2-c))-hi
75-
FSUBD F15, F1, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi)
59+
FMOVS $P5, F8 // F8 = P5
60+
FMOVS $P4, F9 // F9 = P4
61+
FMADDS F7, F9, F8, F13 // P4+t*P5
62+
FMOVS $P3, F10 // F10 = P3
63+
FMADDS F7, F10, F13, F13 // P3+t*(P4+t*P5)
64+
FMOVS $P2, F11 // F11 = P2
65+
FMADDS F7, F11, F13, F13 // P2+t*(P3+t*(P4+t*P5))
66+
FMOVS $P1, F12 // F12 = P1
67+
FMADDS F7, F12, F13, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5)))
68+
FMSUBS F7, F6, F13, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))))
69+
FMOVS $2.0, F14
70+
FSUBS F13, F14
71+
FMULS F6, F13, F15
72+
FDIVS F14, F15 // F15 = (r*c)/(2-c)
73+
FSUBS F15, F5, F15 // lo-(r*c)/(2-c)
74+
FSUBS F4, F15, F15 // (lo-(r*c)/(2-c))-hi
75+
FSUBS F15, F1, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi)
7676
// inline Ldexp(y, k), benefit:
7777
// 1, no parameter pass overhead.
7878
// 2, skip unnecessary checks for Inf/NaN/Zero
79-
FMOVD F16, R0
80-
AND $FracMask, R0, R2 // fraction
81-
LSR $52, R0, R5 // exponent
82-
ADD R1, R5 // R1 = int(k)
83-
CMP $1, R5
79+
FMOVS F16, R0
80+
ANDS $FracMask, R0, R2 // fraction
81+
LSRW $23, R0, R5 // exponent
82+
ADDS R1, R5 // R1 = int(k)
83+
CMPW $1, R5
8484
BGE normal
85-
ADD $52, R5 // denormal
86-
MOVD $C1, R8
87-
FMOVD R8, F1 // m = 2**-52
85+
ADDS $23, R5 // denormal
86+
MOVW $C1, R8
87+
FMOVS R8, F1 // m = 2**-23
8888
normal:
89-
ORR R5<<52, R2, R0
90-
FMOVD R0, F0
91-
FMULD F1, F0 // return m * x
92-
FMOVD F0, ret+8(FP)
89+
ORRW R5<<23, R2, R0
90+
FMOVS R0, F0
91+
FMULS F1, F0 // return m * x
92+
FMOVS F0, ret+8(FP)
9393
RET
9494
nearzero:
95-
FADDD F1, F0
95+
FADDS F1, F0
9696
isNaN:
97-
FMOVD F0, ret+8(FP)
97+
FMOVS F0, ret+8(FP)
9898
RET
9999
underflow:
100-
MOVD ZR, ret+8(FP)
100+
MOVW ZR, ret+8(FP)
101101
RET
102102
overflow:
103-
MOVD $PosInf, R0
104-
MOVD R0, ret+8(FP)
103+
MOVW $PosInf, R0
104+
MOVW R0, ret+8(FP)
105105
RET
106106

107107

108108
// Exp2 returns 2**x, the base-2 exponential of x.
109109
// This is an assembly implementation of the method used for function Exp2 in file exp.go.
110110
//
111-
// func Exp2(x float64) float64
111+
// func Exp2(x float32) float32
112112
TEXT ·Exp2(SB),$0-16
113-
FMOVD x+0(FP), F0 // F0 = x
114-
FCMPD F0, F0
113+
FMOVS x+0(FP), F0 // F0 = x
114+
FCMPS F0, F0
115115
BNE isNaN // x = NaN, return NaN
116-
FMOVD $Overflow2, F1
117-
FCMPD F1, F0
116+
FMOVS $Overflow2, F1
117+
FCMPS F1, F0
118118
BGT overflow // x > Overflow, return PosInf
119-
FMOVD $Underflow2, F1
120-
FCMPD F1, F0
119+
FMOVS $Underflow2, F1
120+
FCMPS F1, F0
121121
BLT underflow // x < Underflow, return 0
122122
// argument reduction; x = r*lg(e) + k with |r| <= ln(2)/2
123123
// computed as r = hi - lo for extra precision.
124-
FMOVD $0.5, F2
125-
FSUBD F2, F0, F3 // x + 0.5
126-
FADDD F2, F0, F4 // x - 0.5
127-
FCMPD $0.0, F0
128-
FCSELD LT, F3, F4, F3 // F3 = k
129-
FCVTZSD F3, R1 // R1 = int(k)
130-
SCVTFD R1, F3 // F3 = float64(int(k))
131-
FSUBD F3, F0, F3 // t = x - float64(int(k))
132-
FMOVD $Ln2Hi, F4 // F4 = Ln2Hi
133-
FMOVD $Ln2Lo, F5 // F5 = Ln2Lo
134-
FMULD F3, F4 // F4 = hi = t * Ln2Hi
135-
FNMULD F3, F5 // F5 = lo = -t * Ln2Lo
136-
FSUBD F5, F4, F6 // F6 = r = hi - lo
137-
FMULD F6, F6, F7 // F7 = t = r * r
124+
FMOVS $0.5, F2
125+
FSUBS F2, F0, F3 // x - 0.5
125+
FADDS F2, F0, F4 // x + 0.5
127+
FCMPS $0.0, F0
128+
FCSELS LT, F3, F4, F3 // F3 = k
129+
FCVTZSS F3, R1 // R1 = int(k)
130+
SCVTFS R1, F3 // F3 = float32(int(k))
131+
FSUBS F3, F0, F3 // t = x - float32(int(k))
132+
FMOVS $Ln2Hi, F4 // F4 = Ln2Hi
133+
FMOVS $Ln2Lo, F5 // F5 = Ln2Lo
134+
FMULS F3, F4 // F4 = hi = t * Ln2Hi
135+
FNMULS F3, F5 // F5 = lo = -t * Ln2Lo
136+
FSUBS F5, F4, F6 // F6 = r = hi - lo
137+
FMULS F6, F6, F7 // F7 = t = r * r
138138
// compute y
139-
FMOVD $P5, F8 // F8 = P5
140-
FMOVD $P4, F9 // F9 = P4
141-
FMADDD F7, F9, F8, F13 // P4+t*P5
142-
FMOVD $P3, F10 // F10 = P3
143-
FMADDD F7, F10, F13, F13 // P3+t*(P4+t*P5)
144-
FMOVD $P2, F11 // F11 = P2
145-
FMADDD F7, F11, F13, F13 // P2+t*(P3+t*(P4+t*P5))
146-
FMOVD $P1, F12 // F12 = P1
147-
FMADDD F7, F12, F13, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5)))
148-
FMSUBD F7, F6, F13, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))))
149-
FMOVD $2.0, F14
150-
FSUBD F13, F14
151-
FMULD F6, F13, F15
152-
FDIVD F14, F15 // F15 = (r*c)/(2-c)
153-
FMOVD $1.0, F1 // F1 = 1.0
154-
FSUBD F15, F5, F15 // lo-(r*c)/(2-c)
155-
FSUBD F4, F15, F15 // (lo-(r*c)/(2-c))-hi
156-
FSUBD F15, F1, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi)
139+
FMOVS $P5, F8 // F8 = P5
140+
FMOVS $P4, F9 // F9 = P4
141+
FMADDS F7, F9, F8, F13 // P4+t*P5
142+
FMOVS $P3, F10 // F10 = P3
143+
FMADDS F7, F10, F13, F13 // P3+t*(P4+t*P5)
144+
FMOVS $P2, F11 // F11 = P2
145+
FMADDS F7, F11, F13, F13 // P2+t*(P3+t*(P4+t*P5))
146+
FMOVS $P1, F12 // F12 = P1
147+
FMADDS F7, F12, F13, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5)))
148+
FMSUBS F7, F6, F13, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))))
149+
FMOVS $2.0, F14
150+
FSUBS F13, F14
151+
FMULS F6, F13, F15
152+
FDIVS F14, F15 // F15 = (r*c)/(2-c)
153+
FMOVS $1.0, F1 // F1 = 1.0
154+
FSUBS F15, F5, F15 // lo-(r*c)/(2-c)
155+
FSUBS F4, F15, F15 // (lo-(r*c)/(2-c))-hi
156+
FSUBS F15, F1, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi)
157157
// inline Ldexp(y, k), benefit:
158158
// 1, no parameter pass overhead.
159159
// 2, skip unnecessary checks for Inf/NaN/Zero
160-
FMOVD F16, R0
161-
AND $FracMask, R0, R2 // fraction
162-
LSR $52, R0, R5 // exponent
163-
ADD R1, R5 // R1 = int(k)
164-
CMP $1, R5
160+
FMOVS F16, R0
161+
ANDS $FracMask, R0, R2 // fraction
162+
LSRW $23, R0, R5 // exponent
163+
ADDS R1, R5 // R1 = int(k)
164+
CMPW $1, R5
165165
BGE normal
166-
ADD $52, R5 // denormal
167-
MOVD $C1, R8
168-
FMOVD R8, F1 // m = 2**-52
166+
ADDS $23, R5 // denormal
167+
MOVW $C1, R8
168+
FMOVS R8, F1 // m = 2**-23
169169
normal:
170-
ORR R5<<52, R2, R0
171-
FMOVD R0, F0
172-
FMULD F1, F0 // return m * x
170+
ORRW R5<<23, R2, R0
171+
FMOVS R0, F0
172+
FMULS F1, F0 // return m * x
173173
isNaN:
174-
FMOVD F0, ret+8(FP)
174+
FMOVS F0, ret+8(FP)
175175
RET
176176
underflow:
177-
MOVD ZR, ret+8(FP)
177+
MOVW ZR, ret+8(FP)
178178
RET
179179
overflow:
180-
MOVD $PosInf, R0
181-
MOVD R0, ret+8(FP)
180+
MOVW $PosInf, R0
181+
MOVW R0, ret+8(FP)
182182
RET

export_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@ package math32
33
// Export internal functions for testing.
44

55
// var Exp2Go = exp2
6-
// var SqrtGo = sqrt
6+
var SqrtGo = sqrt
77
var ExpGo = exp
88
var HypotGo = hypot

sqrt_arm64.s

+3-3
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
// func Sqrt(x float32) float32
88
TEXT ·Sqrt(SB),NOSPLIT,$0
9-
FMOVD x+0(FP), F0
10-
FSQRTD F0, F0
11-
FMOVD F0, ret+8(FP)
9+
FMOVS x+0(FP), F0
10+
FSQRTS F0, F0
11+
FMOVS F0, ret+8(FP)
1212
RET

0 commit comments

Comments
 (0)