|
2 | 2 | // Use of this source code is governed by a BSD-style
|
3 | 3 | // license that can be found in the LICENSE file.
|
4 | 4 |
|
5 |
| -#define Ln2Hi 6.93147180369123816490e-01 |
6 |
| -#define Ln2Lo 1.90821492927058770002e-10 |
7 |
| -#define Log2e 1.44269504088896338700e+00 |
8 |
| -#define Overflow 7.09782712893383973096e+02 |
9 |
| -#define Underflow -7.45133219101941108420e+02 |
10 |
| -#define Overflow2 1.0239999999999999e+03 |
| 5 | +#define Ln2Hi 6.9313812256e-01 |
| 6 | +#define Ln2Lo 9.0580006145e-06 |
| 7 | +#define Log2e 1.4426950216e+00 |
| 8 | +#define Overflow 7.097827e+02 |
| 9 | +#define Underflow -7.451332e+02 |
| 10 | +#define Overflow2 1.024000e+03 |
11 | 11 | #define Underflow2 -1.0740e+03
|
12 |
| -#define NearZero 0x3e30000000000000 // 2**-28 |
13 |
| -#define PosInf 0x7ff0000000000000 |
14 |
| -#define FracMask 0x000fffffffffffff |
15 |
| -#define C1 0x3cb0000000000000 // 2**-52 |
16 |
| -#define P1 1.66666666666666657415e-01 // 0x3FC55555; 0x55555555 |
17 |
| -#define P2 -2.77777777770155933842e-03 // 0xBF66C16C; 0x16BEBD93 |
18 |
| -#define P3 6.61375632143793436117e-05 // 0x3F11566A; 0xAF25DE2C |
19 |
| -#define P4 -1.65339022054652515390e-06 // 0xBEBBBD41; 0xC5D26BF1 |
20 |
| -#define P5 4.13813679705723846039e-08 // 0x3E663769; 0x72BEA4D0 |
| 12 | +#define NearZero 0x317fffff // 2**-28 |
| 13 | +#define PosInf 0x7f800000 |
| 14 | +#define FracMask 0x07fffff |
| 15 | +#define C1 0x34000000 // 2**-23 |
| 16 | +#define P1 1.6666667163e-01 // 0x3FC55555; 0x55555555 |
| 17 | +#define P2 -2.7777778450e-03 // 0xBF66C16C; 0x16BEBD93 |
| 18 | +#define P3 6.6137559770e-05 // 0x3F11566A; 0xAF25DE2C |
| 19 | +#define P4 -1.6533901999e-06 // 0xBEBBBD41; 0xC5D26BF1 |
| 20 | +#define P5 4.1381369442e-08 // 0x3E663769; 0x72BEA4D0 |
21 | 21 |
|
22 | 22 | // Exp returns e**x, the base-e exponential of x.
|
23 | 23 | // This is an assembly implementation of the method used for function Exp in file exp.go.
|
24 | 24 | //
|
25 |
| -// func Exp(x float64) float64 |
| 25 | +// func Exp(x float32) float32 |
26 | 26 | TEXT ·Exp(SB),$0-16
|
27 |
| - FMOVD x+0(FP), F0 // F0 = x |
28 |
| - FCMPD F0, F0 |
| 27 | + FMOVS x+0(FP), F0 // F0 = x |
| 28 | + FCMPS F0, F0 |
29 | 29 | BNE isNaN // x = NaN, return NaN
|
30 |
| - FMOVD $Overflow, F1 |
31 |
| - FCMPD F1, F0 |
| 30 | + FMOVS $Overflow, F1 |
| 31 | + FCMPS F1, F0 |
32 | 32 | BGT overflow // x > Overflow, return PosInf
|
33 |
| - FMOVD $Underflow, F1 |
34 |
| - FCMPD F1, F0 |
| 33 | + FMOVS $Underflow, F1 |
| 34 | + FCMPS F1, F0 |
35 | 35 | BLT underflow // x < Underflow, return 0
|
36 |
| - MOVD $NearZero, R0 |
37 |
| - FMOVD R0, F2 |
38 |
| - FABSD F0, F3 |
39 |
| - FMOVD $1.0, F1 // F1 = 1.0 |
40 |
| - FCMPD F2, F3 |
| 36 | + MOVW $NearZero, R0 |
| 37 | + FMOVS R0, F2 |
| 38 | + FABSS F0, F3 |
| 39 | + FMOVS $1.0, F1 // F1 = 1.0 |
| 40 | + FCMPS F2, F3 |
41 | 41 | BLT nearzero // fabs(x) < NearZero, return 1 + x
|
42 | 42 | // argument reduction, x = k*ln2 + r, |r| <= 0.5*ln2
|
43 | 43 | // computed as r = hi - lo for extra precision.
|
44 |
| - FMOVD $Log2e, F2 |
45 |
| - FMOVD $0.5, F3 |
46 |
| - FNMSUBD F0, F3, F2, F4 // Log2e*x - 0.5 |
47 |
| - FMADDD F0, F3, F2, F3 // Log2e*x + 0.5 |
48 |
| - FCMPD $0.0, F0 |
49 |
| - FCSELD LT, F4, F3, F3 // F3 = k |
50 |
| - FCVTZSD F3, R1 // R1 = int(k) |
51 |
| - SCVTFD R1, F3 // F3 = float64(int(k)) |
52 |
| - FMOVD $Ln2Hi, F4 // F4 = Ln2Hi |
53 |
| - FMOVD $Ln2Lo, F5 // F5 = Ln2Lo |
54 |
| - FMSUBD F3, F0, F4, F4 // F4 = hi = x - float64(int(k))*Ln2Hi |
55 |
| - FMULD F3, F5 // F5 = lo = float64(int(k)) * Ln2Lo |
56 |
| - FSUBD F5, F4, F6 // F6 = r = hi - lo |
57 |
| - FMULD F6, F6, F7 // F7 = t = r * r |
| 44 | + FMOVS $Log2e, F2 |
| 45 | + FMOVS $0.5, F3 |
| 46 | + FNMSUBS F0, F3, F2, F4 // Log2e*x - 0.5 |
| 47 | + FMADDS F0, F3, F2, F3 // Log2e*x + 0.5 |
| 48 | + FCMPS $0.0, F0 |
| 49 | + FCSELS LT, F4, F3, F3 // F3 = k |
| 50 | + FCVTZSS F3, R1 // R1 = int(k) |
| 51 | + SCVTFS R1, F3 // F3 = float32(int(k)) |
| 52 | + FMOVS $Ln2Hi, F4 // F4 = Ln2Hi |
| 53 | + FMOVS $Ln2Lo, F5 // F5 = Ln2Lo |
| 54 | + FMSUBS F3, F0, F4, F4 // F4 = hi = x - float32(int(k))*Ln2Hi |
| 55 | + FMULS F3, F5 // F5 = lo = float32(int(k)) * Ln2Lo |
| 56 | + FSUBS F5, F4, F6 // F6 = r = hi - lo |
| 57 | + FMULS F6, F6, F7 // F7 = t = r * r |
58 | 58 | // compute y
|
59 |
| - FMOVD $P5, F8 // F8 = P5 |
60 |
| - FMOVD $P4, F9 // F9 = P4 |
61 |
| - FMADDD F7, F9, F8, F13 // P4+t*P5 |
62 |
| - FMOVD $P3, F10 // F10 = P3 |
63 |
| - FMADDD F7, F10, F13, F13 // P3+t*(P4+t*P5) |
64 |
| - FMOVD $P2, F11 // F11 = P2 |
65 |
| - FMADDD F7, F11, F13, F13 // P2+t*(P3+t*(P4+t*P5)) |
66 |
| - FMOVD $P1, F12 // F12 = P1 |
67 |
| - FMADDD F7, F12, F13, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5))) |
68 |
| - FMSUBD F7, F6, F13, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5)))) |
69 |
| - FMOVD $2.0, F14 |
70 |
| - FSUBD F13, F14 |
71 |
| - FMULD F6, F13, F15 |
72 |
| - FDIVD F14, F15 // F15 = (r*c)/(2-c) |
73 |
| - FSUBD F15, F5, F15 // lo-(r*c)/(2-c) |
74 |
| - FSUBD F4, F15, F15 // (lo-(r*c)/(2-c))-hi |
75 |
| - FSUBD F15, F1, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi) |
| 59 | + FMOVS $P5, F8 // F8 = P5 |
| 60 | + FMOVS $P4, F9 // F9 = P4 |
| 61 | + FMADDS F7, F9, F8, F13 // P4+t*P5 |
| 62 | + FMOVS $P3, F10 // F10 = P3 |
| 63 | + FMADDS F7, F10, F13, F13 // P3+t*(P4+t*P5) |
| 64 | + FMOVS $P2, F11 // F11 = P2 |
| 65 | + FMADDS F7, F11, F13, F13 // P2+t*(P3+t*(P4+t*P5)) |
| 66 | + FMOVS $P1, F12 // F12 = P1 |
| 67 | + FMADDS F7, F12, F13, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5))) |
| 68 | + FMSUBS F7, F6, F13, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5)))) |
| 69 | + FMOVS $2.0, F14 |
| 70 | + FSUBS F13, F14 |
| 71 | + FMULS F6, F13, F15 |
| 72 | + FDIVS F14, F15 // F15 = (r*c)/(2-c) |
| 73 | + FSUBS F15, F5, F15 // lo-(r*c)/(2-c) |
| 74 | + FSUBS F4, F15, F15 // (lo-(r*c)/(2-c))-hi |
| 75 | + FSUBS F15, F1, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi) |
76 | 76 | // inline Ldexp(y, k), benefit:
|
77 | 77 | // 1, no parameter pass overhead.
|
78 | 78 | // 2, skip unnecessary checks for Inf/NaN/Zero
|
79 |
| - FMOVD F16, R0 |
80 |
| - AND $FracMask, R0, R2 // fraction |
81 |
| - LSR $52, R0, R5 // exponent |
82 |
| - ADD R1, R5 // R1 = int(k) |
83 |
| - CMP $1, R5 |
| 79 | + FMOVS F16, R0 |
| 80 | + ANDS $FracMask, R0, R2 // fraction |
| 81 | + LSRW $23, R0, R5 // exponent |
| 82 | + ADDS R1, R5 // R1 = int(k) |
| 83 | + CMPW $1, R5 |
84 | 84 | BGE normal
|
85 |
| - ADD $52, R5 // denormal |
86 |
| - MOVD $C1, R8 |
87 |
| - FMOVD R8, F1 // m = 2**-52 |
| 85 | + ADDS $23, R5 // denormal |
| 86 | + MOVW $C1, R8 |
| 87 | + FMOVS R8, F1 // m = 2**-23 |
88 | 88 | normal:
|
89 |
| - ORR R5<<52, R2, R0 |
90 |
| - FMOVD R0, F0 |
91 |
| - FMULD F1, F0 // return m * x |
92 |
| - FMOVD F0, ret+8(FP) |
| 89 | + ORRW R5<<23, R2, R0 |
| 90 | + FMOVS R0, F0 |
| 91 | + FMULS F1, F0 // return m * x |
| 92 | + FMOVS F0, ret+8(FP) |
93 | 93 | RET
|
94 | 94 | nearzero:
|
95 |
| - FADDD F1, F0 |
| 95 | + FADDS F1, F0 |
96 | 96 | isNaN:
|
97 |
| - FMOVD F0, ret+8(FP) |
| 97 | + FMOVS F0, ret+8(FP) |
98 | 98 | RET
|
99 | 99 | underflow:
|
100 |
| - MOVD ZR, ret+8(FP) |
| 100 | + MOVW ZR, ret+8(FP) |
101 | 101 | RET
|
102 | 102 | overflow:
|
103 |
| - MOVD $PosInf, R0 |
104 |
| - MOVD R0, ret+8(FP) |
| 103 | + MOVW $PosInf, R0 |
| 104 | + MOVW R0, ret+8(FP) |
105 | 105 | RET
|
106 | 106 |
|
107 | 107 |
|
108 | 108 | // Exp2 returns 2**x, the base-2 exponential of x.
|
109 | 109 | // This is an assembly implementation of the method used for function Exp2 in file exp.go.
|
110 | 110 | //
|
111 |
| -// func Exp2(x float64) float64 |
| 111 | +// func Exp2(x float32) float32 |
112 | 112 | TEXT ·Exp2(SB),$0-16
|
113 |
| - FMOVD x+0(FP), F0 // F0 = x |
114 |
| - FCMPD F0, F0 |
| 113 | + FMOVS x+0(FP), F0 // F0 = x |
| 114 | + FCMPS F0, F0 |
115 | 115 | BNE isNaN // x = NaN, return NaN
|
116 |
| - FMOVD $Overflow2, F1 |
117 |
| - FCMPD F1, F0 |
| 116 | + FMOVS $Overflow2, F1 |
| 117 | + FCMPS F1, F0 |
118 | 118 | BGT overflow // x > Overflow, return PosInf
|
119 |
| - FMOVD $Underflow2, F1 |
120 |
| - FCMPD F1, F0 |
| 119 | + FMOVS $Underflow2, F1 |
| 120 | + FCMPS F1, F0 |
121 | 121 | BLT underflow // x < Underflow, return 0
|
122 | 122 | // argument reduction; x = r*lg(e) + k with |r| <= ln(2)/2
|
123 | 123 | // computed as r = hi - lo for extra precision.
|
124 |
| - FMOVD $0.5, F2 |
125 |
| - FSUBD F2, F0, F3 // x + 0.5 |
126 |
| - FADDD F2, F0, F4 // x - 0.5 |
127 |
| - FCMPD $0.0, F0 |
128 |
| - FCSELD LT, F3, F4, F3 // F3 = k |
129 |
| - FCVTZSD F3, R1 // R1 = int(k) |
130 |
| - SCVTFD R1, F3 // F3 = float64(int(k)) |
131 |
| - FSUBD F3, F0, F3 // t = x - float64(int(k)) |
132 |
| - FMOVD $Ln2Hi, F4 // F4 = Ln2Hi |
133 |
| - FMOVD $Ln2Lo, F5 // F5 = Ln2Lo |
134 |
| - FMULD F3, F4 // F4 = hi = t * Ln2Hi |
135 |
| - FNMULD F3, F5 // F5 = lo = -t * Ln2Lo |
136 |
| - FSUBD F5, F4, F6 // F6 = r = hi - lo |
137 |
| - FMULD F6, F6, F7 // F7 = t = r * r |
| 124 | + FMOVS $0.5, F2 |
| 125 | + FSUBS F2, F0, F3 // x + 0.5 |
| 126 | + FADDS F2, F0, F4 // x - 0.5 |
| 127 | + FCMPS $0.0, F0 |
| 128 | + FCSELS LT, F3, F4, F3 // F3 = k |
| 129 | + FCVTZSS F3, R1 // R1 = int(k) |
| 130 | + SCVTFS R1, F3 // F3 = float32(int(k)) |
| 131 | + FSUBS F3, F0, F3 // t = x - float32(int(k)) |
| 132 | + FMOVS $Ln2Hi, F4 // F4 = Ln2Hi |
| 133 | + FMOVS $Ln2Lo, F5 // F5 = Ln2Lo |
| 134 | + FMULS F3, F4 // F4 = hi = t * Ln2Hi |
| 135 | + FNMULS F3, F5 // F5 = lo = -t * Ln2Lo |
| 136 | + FSUBS F5, F4, F6 // F6 = r = hi - lo |
| 137 | + FMULS F6, F6, F7 // F7 = t = r * r |
138 | 138 | // compute y
|
139 |
| - FMOVD $P5, F8 // F8 = P5 |
140 |
| - FMOVD $P4, F9 // F9 = P4 |
141 |
| - FMADDD F7, F9, F8, F13 // P4+t*P5 |
142 |
| - FMOVD $P3, F10 // F10 = P3 |
143 |
| - FMADDD F7, F10, F13, F13 // P3+t*(P4+t*P5) |
144 |
| - FMOVD $P2, F11 // F11 = P2 |
145 |
| - FMADDD F7, F11, F13, F13 // P2+t*(P3+t*(P4+t*P5)) |
146 |
| - FMOVD $P1, F12 // F12 = P1 |
147 |
| - FMADDD F7, F12, F13, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5))) |
148 |
| - FMSUBD F7, F6, F13, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5)))) |
149 |
| - FMOVD $2.0, F14 |
150 |
| - FSUBD F13, F14 |
151 |
| - FMULD F6, F13, F15 |
152 |
| - FDIVD F14, F15 // F15 = (r*c)/(2-c) |
153 |
| - FMOVD $1.0, F1 // F1 = 1.0 |
154 |
| - FSUBD F15, F5, F15 // lo-(r*c)/(2-c) |
155 |
| - FSUBD F4, F15, F15 // (lo-(r*c)/(2-c))-hi |
156 |
| - FSUBD F15, F1, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi) |
| 139 | + FMOVS $P5, F8 // F8 = P5 |
| 140 | + FMOVS $P4, F9 // F9 = P4 |
| 141 | + FMADDS F7, F9, F8, F13 // P4+t*P5 |
| 142 | + FMOVS $P3, F10 // F10 = P3 |
| 143 | + FMADDS F7, F10, F13, F13 // P3+t*(P4+t*P5) |
| 144 | + FMOVS $P2, F11 // F11 = P2 |
| 145 | + FMADDS F7, F11, F13, F13 // P2+t*(P3+t*(P4+t*P5)) |
| 146 | + FMOVS $P1, F12 // F12 = P1 |
| 147 | + FMADDS F7, F12, F13, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5))) |
| 148 | + FMSUBS F7, F6, F13, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5)))) |
| 149 | + FMOVS $2.0, F14 |
| 150 | + FSUBS F13, F14 |
| 151 | + FMULS F6, F13, F15 |
| 152 | + FDIVS F14, F15 // F15 = (r*c)/(2-c) |
| 153 | + FMOVS $1.0, F1 // F1 = 1.0 |
| 154 | + FSUBS F15, F5, F15 // lo-(r*c)/(2-c) |
| 155 | + FSUBS F4, F15, F15 // (lo-(r*c)/(2-c))-hi |
| 156 | + FSUBS F15, F1, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi) |
157 | 157 | // inline Ldexp(y, k), benefit:
|
158 | 158 | // 1, no parameter pass overhead.
|
159 | 159 | // 2, skip unnecessary checks for Inf/NaN/Zero
|
160 |
| - FMOVD F16, R0 |
161 |
| - AND $FracMask, R0, R2 // fraction |
162 |
| - LSR $52, R0, R5 // exponent |
163 |
| - ADD R1, R5 // R1 = int(k) |
164 |
| - CMP $1, R5 |
| 160 | + FMOVS F16, R0 |
| 161 | + ANDS $FracMask, R0, R2 // fraction |
| 162 | + LSRW $23, R0, R5 // exponent |
| 163 | + ADDS R1, R5 // R1 = int(k) |
| 164 | + CMPW $1, R5 |
165 | 165 | BGE normal
|
166 |
| - ADD $52, R5 // denormal |
167 |
| - MOVD $C1, R8 |
168 |
| - FMOVD R8, F1 // m = 2**-52 |
| 166 | + ADDS $23, R5 // denormal |
| 167 | + MOVW $C1, R8 |
| 168 | + FMOVS R8, F1 // m = 2**-52 |
169 | 169 | normal:
|
170 |
| - ORR R5<<52, R2, R0 |
171 |
| - FMOVD R0, F0 |
172 |
| - FMULD F1, F0 // return m * x |
| 170 | + ORRW R5<<23, R2, R0 |
| 171 | + FMOVS R0, F0 |
| 172 | + FMULS F1, F0 // return m * x |
173 | 173 | isNaN:
|
174 |
| - FMOVD F0, ret+8(FP) |
| 174 | + FMOVS F0, ret+8(FP) |
175 | 175 | RET
|
176 | 176 | underflow:
|
177 |
| - MOVD ZR, ret+8(FP) |
| 177 | + MOVW ZR, ret+8(FP) |
178 | 178 | RET
|
179 | 179 | overflow:
|
180 |
| - MOVD $PosInf, R0 |
181 |
| - MOVD R0, ret+8(FP) |
| 180 | + MOVW $PosInf, R0 |
| 181 | + MOVW R0, ret+8(FP) |
182 | 182 | RET
|
0 commit comments