@@ -60,6 +60,7 @@ def test_write_3x3_depthwise_code():
6060 """
6161 #ifndef TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS
6262 #define TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS
63+ #include <arm_acle.h>
6364 __attribute__((always_inline)) static inline int tensordot_opt_x1_int16_w48_3x3_000(
6465 int *output, int *tensor, int *kernel, int *bias, int *scale
6566 ) {
@@ -78,18 +79,18 @@ def test_write_3x3_depthwise_code():
7879 int kernel__y02_x00__y02_x01 = kernel[3];
7980 int kernel__y02_x02__unknown = kernel[4];
8081
81- sum_0 = __builtin_arm_smlad (tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
82- sum_0 = __builtin_arm_smlabb (tensor__y00_x02__unknown, kernel__y00_x02__y01_x00, sum_0);
83- sum_0 = __builtin_arm_smlatb(kernel__y00_x02__y01_x00, tensor__y01_x00__y01_x01 , sum_0);
84- sum_0 = __builtin_arm_smlatb (tensor__y01_x00__y01_x01, kernel__y01_x01__y01_x02, sum_0);
85- sum_0 = __builtin_arm_smlatb(kernel__y01_x01__y01_x02, tensor__y01_x02__unknown , sum_0);
86- sum_0 = __builtin_arm_smlad (tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0);
87- sum_0 = __builtin_arm_smlabb (tensor__y02_x02__unknown, kernel__y02_x02__unknown, sum_0);
82+ sum_0 = __smlad (tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
83+ sum_0 = __smlabb (tensor__y00_x02__unknown, kernel__y00_x02__y01_x00, sum_0);
84+ sum_0 = __smlabt(tensor__y01_x00__y01_x01, kernel__y00_x02__y01_x00 , sum_0);
85+ sum_0 = __smlatb (tensor__y01_x00__y01_x01, kernel__y01_x01__y01_x02, sum_0);
86+ sum_0 = __smlabt(tensor__y01_x02__unknown, kernel__y01_x01__y01_x02 , sum_0);
87+ sum_0 = __smlad (tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0);
88+ sum_0 = __smlabb (tensor__y02_x02__unknown, kernel__y02_x02__unknown, sum_0);
8889
8990 int scale_val = *scale;
9091 int requant_0 = (sum_0 * (long long) scale_val) >> 32;
9192 requant_0 = (requant_0 + 1) >> 1;
92- requant_0 = __builtin_arm_ssat (requant_0 + -128, 8);
93+ requant_0 = __ssat (requant_0 + -128, 8);
9394
9495 ((short*) output)[0] = (short) requant_0;
9596 return 0;
@@ -112,6 +113,7 @@ def test_odd_width_3x3_depthwise_strides_code():
112113 """
113114 #ifndef TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS
114115 #define TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS
116+ #include <arm_acle.h>
115117 __attribute__((always_inline)) static inline int tensordot_opt_x2_int16_w49_3x3_000_2_4(
116118 int *output, int *tensor, int *kernel, int *bias, int *scale
117119 ) {
@@ -133,26 +135,26 @@ def test_odd_width_3x3_depthwise_strides_code():
133135 int kernel__y02_x00__y02_x01 = kernel[3];
134136 int kernel__y02_x02__unknown = kernel[4];
135137
136- sum_0 = __builtin_arm_smlad (tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
137- sum_0 = __builtin_arm_smlabb (tensor__y00_x02__y00_x03, kernel__y00_x02__y01_x00, sum_0);
138- sum_0 = __builtin_arm_smlatt (tensor__unknown__y01_x00, kernel__y00_x02__y01_x00, sum_0);
139- sum_0 = __builtin_arm_smlad (tensor__y01_x01__y01_x02, kernel__y01_x01__y01_x02, sum_0);
140- sum_0 = __builtin_arm_smlad (tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0);
141- sum_0 = __builtin_arm_smlabb (tensor__y02_x02__y02_x03, kernel__y02_x02__unknown, sum_0);
142- sum_1 = __builtin_arm_smlad (tensor__y00_x02__y00_x03, kernel__y00_x00__y00_x01, sum_1);
143- sum_1 = __builtin_arm_smlabb (tensor__y00_x04__unknown, kernel__y00_x02__y01_x00, sum_1);
144- sum_1 = __builtin_arm_smlatt (tensor__y01_x01__y01_x02, kernel__y00_x02__y01_x00, sum_1);
145- sum_1 = __builtin_arm_smlad (tensor__y01_x03__y01_x04, kernel__y01_x01__y01_x02, sum_1);
146- sum_1 = __builtin_arm_smlad (tensor__y02_x02__y02_x03, kernel__y02_x00__y02_x01, sum_1);
147- sum_1 = __builtin_arm_smlabb (tensor__y02_x04__unknown, kernel__y02_x02__unknown, sum_1);
138+ sum_0 = __smlad (tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
139+ sum_0 = __smlabb (tensor__y00_x02__y00_x03, kernel__y00_x02__y01_x00, sum_0);
140+ sum_0 = __smlatt (tensor__unknown__y01_x00, kernel__y00_x02__y01_x00, sum_0);
141+ sum_0 = __smlad (tensor__y01_x01__y01_x02, kernel__y01_x01__y01_x02, sum_0);
142+ sum_0 = __smlad (tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0);
143+ sum_0 = __smlabb (tensor__y02_x02__y02_x03, kernel__y02_x02__unknown, sum_0);
144+ sum_1 = __smlad (tensor__y00_x02__y00_x03, kernel__y00_x00__y00_x01, sum_1);
145+ sum_1 = __smlabb (tensor__y00_x04__unknown, kernel__y00_x02__y01_x00, sum_1);
146+ sum_1 = __smlatt (tensor__y01_x01__y01_x02, kernel__y00_x02__y01_x00, sum_1);
147+ sum_1 = __smlad (tensor__y01_x03__y01_x04, kernel__y01_x01__y01_x02, sum_1);
148+ sum_1 = __smlad (tensor__y02_x02__y02_x03, kernel__y02_x00__y02_x01, sum_1);
149+ sum_1 = __smlabb (tensor__y02_x04__unknown, kernel__y02_x02__unknown, sum_1);
148150
149151 int scale_val = *scale;
150152 int requant_0 = (sum_0 * (long long) scale_val) >> 32;
151153 requant_0 = (requant_0 + 1) >> 1;
152- requant_0 = __builtin_arm_ssat (requant_0 + -128, 8);
154+ requant_0 = __ssat (requant_0 + -128, 8);
153155 int requant_1 = (sum_1 * (long long) scale_val) >> 32;
154156 requant_1 = (requant_1 + 1) >> 1;
155- requant_1 = __builtin_arm_ssat (requant_1 + -128, 8);
157+ requant_1 = __ssat (requant_1 + -128, 8);
156158
157159 ((short*) output)[0] = (short) requant_0;
158160 ((short*) output)[4] = (short) requant_1;
@@ -174,6 +176,7 @@ def test_1x1x8_convolution_code():
174176 """
175177 #ifndef TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS
176178 #define TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS
179+ #include <arm_acle.h>
177180 __attribute__((always_inline)) static inline int tensordot_opt_x4_int16_w384_1x8_000_8_1(
178181 int *output, int *tensor, int *kernel, int *bias, int *scale
179182 ) {
@@ -201,36 +204,36 @@ def test_1x1x8_convolution_code():
201204 int kernel__y00_x04__y00_x05 = kernel[2];
202205 int kernel__y00_x06__y00_x07 = kernel[3];
203206
204- sum_0 = __builtin_arm_smlad (tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
205- sum_0 = __builtin_arm_smlad (tensor__y00_x02__y00_x03, kernel__y00_x02__y00_x03, sum_0);
206- sum_0 = __builtin_arm_smlad (tensor__y00_x04__y00_x05, kernel__y00_x04__y00_x05, sum_0);
207- sum_0 = __builtin_arm_smlad (tensor__y00_x06__y00_x07, kernel__y00_x06__y00_x07, sum_0);
208- sum_1 = __builtin_arm_smlad (tensor__y00_x08__y00_x09, kernel__y00_x00__y00_x01, sum_1);
209- sum_1 = __builtin_arm_smlad (tensor__y00_x0a__y00_x0b, kernel__y00_x02__y00_x03, sum_1);
210- sum_1 = __builtin_arm_smlad (tensor__y00_x0c__y00_x0d, kernel__y00_x04__y00_x05, sum_1);
211- sum_1 = __builtin_arm_smlad (tensor__y00_x0e__y00_x0f, kernel__y00_x06__y00_x07, sum_1);
212- sum_2 = __builtin_arm_smlad (tensor__y00_x10__y00_x11, kernel__y00_x00__y00_x01, sum_2);
213- sum_2 = __builtin_arm_smlad (tensor__y00_x12__y00_x13, kernel__y00_x02__y00_x03, sum_2);
214- sum_2 = __builtin_arm_smlad (tensor__y00_x14__y00_x15, kernel__y00_x04__y00_x05, sum_2);
215- sum_2 = __builtin_arm_smlad (tensor__y00_x16__y00_x17, kernel__y00_x06__y00_x07, sum_2);
216- sum_3 = __builtin_arm_smlad (tensor__y00_x18__y00_x19, kernel__y00_x00__y00_x01, sum_3);
217- sum_3 = __builtin_arm_smlad (tensor__y00_x1a__y00_x1b, kernel__y00_x02__y00_x03, sum_3);
218- sum_3 = __builtin_arm_smlad (tensor__y00_x1c__y00_x1d, kernel__y00_x04__y00_x05, sum_3);
219- sum_3 = __builtin_arm_smlad (tensor__y00_x1e__y00_x1f, kernel__y00_x06__y00_x07, sum_3);
207+ sum_0 = __smlad (tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
208+ sum_0 = __smlad (tensor__y00_x02__y00_x03, kernel__y00_x02__y00_x03, sum_0);
209+ sum_0 = __smlad (tensor__y00_x04__y00_x05, kernel__y00_x04__y00_x05, sum_0);
210+ sum_0 = __smlad (tensor__y00_x06__y00_x07, kernel__y00_x06__y00_x07, sum_0);
211+ sum_1 = __smlad (tensor__y00_x08__y00_x09, kernel__y00_x00__y00_x01, sum_1);
212+ sum_1 = __smlad (tensor__y00_x0a__y00_x0b, kernel__y00_x02__y00_x03, sum_1);
213+ sum_1 = __smlad (tensor__y00_x0c__y00_x0d, kernel__y00_x04__y00_x05, sum_1);
214+ sum_1 = __smlad (tensor__y00_x0e__y00_x0f, kernel__y00_x06__y00_x07, sum_1);
215+ sum_2 = __smlad (tensor__y00_x10__y00_x11, kernel__y00_x00__y00_x01, sum_2);
216+ sum_2 = __smlad (tensor__y00_x12__y00_x13, kernel__y00_x02__y00_x03, sum_2);
217+ sum_2 = __smlad (tensor__y00_x14__y00_x15, kernel__y00_x04__y00_x05, sum_2);
218+ sum_2 = __smlad (tensor__y00_x16__y00_x17, kernel__y00_x06__y00_x07, sum_2);
219+ sum_3 = __smlad (tensor__y00_x18__y00_x19, kernel__y00_x00__y00_x01, sum_3);
220+ sum_3 = __smlad (tensor__y00_x1a__y00_x1b, kernel__y00_x02__y00_x03, sum_3);
221+ sum_3 = __smlad (tensor__y00_x1c__y00_x1d, kernel__y00_x04__y00_x05, sum_3);
222+ sum_3 = __smlad (tensor__y00_x1e__y00_x1f, kernel__y00_x06__y00_x07, sum_3);
220223
221224 int scale_val = *scale;
222225 int requant_0 = (sum_0 * (long long) scale_val) >> 32;
223226 requant_0 = (requant_0 + 1) >> 1;
224- requant_0 = __builtin_arm_ssat (requant_0 + -128, 8);
227+ requant_0 = __ssat (requant_0 + -128, 8);
225228 int requant_1 = (sum_1 * (long long) scale_val) >> 32;
226229 requant_1 = (requant_1 + 1) >> 1;
227- requant_1 = __builtin_arm_ssat (requant_1 + -128, 8);
230+ requant_1 = __ssat (requant_1 + -128, 8);
228231 int requant_2 = (sum_2 * (long long) scale_val) >> 32;
229232 requant_2 = (requant_2 + 1) >> 1;
230- requant_2 = __builtin_arm_ssat (requant_2 + -128, 8);
233+ requant_2 = __ssat (requant_2 + -128, 8);
231234 int requant_3 = (sum_3 * (long long) scale_val) >> 32;
232235 requant_3 = (requant_3 + 1) >> 1;
233- requant_3 = __builtin_arm_ssat (requant_3 + -128, 8);
236+ requant_3 = __ssat (requant_3 + -128, 8);
234237
235238 int packed_res_0 = requant_0 + (requant_1 << 16);
236239 int packed_res_1 = requant_2 + (requant_3 << 16);
@@ -269,6 +272,7 @@ def test_3x3x3_offset_convolution_code():
269272 """
270273 #ifndef TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS
271274 #define TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS
275+ #include <arm_acle.h>
272276 __attribute__((always_inline)) static inline int tensordot_opt_x1_int16_w288_3x9_111(
273277 int *output, int *tensor, int *kernel, int *bias, int *scale
274278 ) {
@@ -305,30 +309,30 @@ def test_3x3x3_offset_convolution_code():
305309 int kernel__y02_x05__y02_x06 = kernel[12];
306310 int kernel__y02_x07__y02_x08 = kernel[13];
307311
308- sum_0 = __builtin_arm_smlatt (tensor__unknown__y00_x00, kernel__unknown__y00_x00, sum_0);
309- sum_0 = __builtin_arm_smlad (tensor__y00_x01__y00_x02, kernel__y00_x01__y00_x02, sum_0);
310- sum_0 = __builtin_arm_smlad (tensor__y00_x03__y00_x04, kernel__y00_x03__y00_x04, sum_0);
311- sum_0 = __builtin_arm_smlad (tensor__y00_x05__y00_x06, kernel__y00_x05__y00_x06, sum_0);
312- sum_0 = __builtin_arm_smlad (tensor__y00_x07__y00_x08, kernel__y00_x07__y00_x08, sum_0);
313- sum_0 = __builtin_arm_smlatb (tensor__unknown__y01_x00, kernel__y01_x00__y01_x01, sum_0);
314- sum_0 = __builtin_arm_smlatb(kernel__y01_x00__y01_x01, tensor__y01_x01__y01_x02 , sum_0);
315- sum_0 = __builtin_arm_smlatb (tensor__y01_x01__y01_x02, kernel__y01_x02__y01_x03, sum_0);
316- sum_0 = __builtin_arm_smlatb(kernel__y01_x02__y01_x03, tensor__y01_x03__y01_x04 , sum_0);
317- sum_0 = __builtin_arm_smlatb (tensor__y01_x03__y01_x04, kernel__y01_x04__y01_x05, sum_0);
318- sum_0 = __builtin_arm_smlatb(kernel__y01_x04__y01_x05, tensor__y01_x05__y01_x06 , sum_0);
319- sum_0 = __builtin_arm_smlatb (tensor__y01_x05__y01_x06, kernel__y01_x06__y01_x07, sum_0);
320- sum_0 = __builtin_arm_smlatb(kernel__y01_x06__y01_x07, tensor__y01_x07__y01_x08 , sum_0);
321- sum_0 = __builtin_arm_smlatb (tensor__y01_x07__y01_x08, kernel__y01_x08__y02_x00, sum_0);
322- sum_0 = __builtin_arm_smlatt (tensor__unknown__y02_x00, kernel__y01_x08__y02_x00, sum_0);
323- sum_0 = __builtin_arm_smlad (tensor__y02_x01__y02_x02, kernel__y02_x01__y02_x02, sum_0);
324- sum_0 = __builtin_arm_smlad (tensor__y02_x03__y02_x04, kernel__y02_x03__y02_x04, sum_0);
325- sum_0 = __builtin_arm_smlad (tensor__y02_x05__y02_x06, kernel__y02_x05__y02_x06, sum_0);
326- sum_0 = __builtin_arm_smlad (tensor__y02_x07__y02_x08, kernel__y02_x07__y02_x08, sum_0);
312+ sum_0 = __smlatt (tensor__unknown__y00_x00, kernel__unknown__y00_x00, sum_0);
313+ sum_0 = __smlad (tensor__y00_x01__y00_x02, kernel__y00_x01__y00_x02, sum_0);
314+ sum_0 = __smlad (tensor__y00_x03__y00_x04, kernel__y00_x03__y00_x04, sum_0);
315+ sum_0 = __smlad (tensor__y00_x05__y00_x06, kernel__y00_x05__y00_x06, sum_0);
316+ sum_0 = __smlad (tensor__y00_x07__y00_x08, kernel__y00_x07__y00_x08, sum_0);
317+ sum_0 = __smlatb (tensor__unknown__y01_x00, kernel__y01_x00__y01_x01, sum_0);
318+ sum_0 = __smlabt(tensor__y01_x01__y01_x02, kernel__y01_x00__y01_x01 , sum_0);
319+ sum_0 = __smlatb (tensor__y01_x01__y01_x02, kernel__y01_x02__y01_x03, sum_0);
320+ sum_0 = __smlabt(tensor__y01_x03__y01_x04, kernel__y01_x02__y01_x03 , sum_0);
321+ sum_0 = __smlatb (tensor__y01_x03__y01_x04, kernel__y01_x04__y01_x05, sum_0);
322+ sum_0 = __smlabt(tensor__y01_x05__y01_x06, kernel__y01_x04__y01_x05 , sum_0);
323+ sum_0 = __smlatb (tensor__y01_x05__y01_x06, kernel__y01_x06__y01_x07, sum_0);
324+ sum_0 = __smlabt(tensor__y01_x07__y01_x08, kernel__y01_x06__y01_x07 , sum_0);
325+ sum_0 = __smlatb (tensor__y01_x07__y01_x08, kernel__y01_x08__y02_x00, sum_0);
326+ sum_0 = __smlatt (tensor__unknown__y02_x00, kernel__y01_x08__y02_x00, sum_0);
327+ sum_0 = __smlad (tensor__y02_x01__y02_x02, kernel__y02_x01__y02_x02, sum_0);
328+ sum_0 = __smlad (tensor__y02_x03__y02_x04, kernel__y02_x03__y02_x04, sum_0);
329+ sum_0 = __smlad (tensor__y02_x05__y02_x06, kernel__y02_x05__y02_x06, sum_0);
330+ sum_0 = __smlad (tensor__y02_x07__y02_x08, kernel__y02_x07__y02_x08, sum_0);
327331
328332 int scale_val = *scale;
329333 int requant_0 = (sum_0 * (long long) scale_val) >> 39;
330334 requant_0 = (requant_0 + 1) >> 1;
331- requant_0 = __builtin_arm_ssat (requant_0 + 4, 8);
335+ requant_0 = __ssat (requant_0 + 4, 8);
332336
333337 ((short*) output)[1] = (short) requant_0;
334338 return 0;
0 commit comments