Skip to content

Commit 29d97a8

Browse files
committed
Fix tensordot opts test
1 parent af83462 commit 29d97a8

File tree

2 files changed

+73
-69
lines changed

2 files changed

+73
-69
lines changed

python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -140,9 +140,9 @@ def get_var(y, x, halfwords) -> Tuple[str, str]:
140140
def _apply_simd_optimizations(instruction_tuples) -> Iterator[SMLAInstruction]:
141141
"""When possible, fuses single MACs into SIMD MAC instructions.
142142
143-
The compiler cannot do this automatically, as calling __builtin_arm_smlaxy forces the SMLAxy
144-
instruction to be used. This function takes as input an iterator of SMLAInstructions and returns
145-
an iterator of SMLAInstructions (possibly of different length).
143+
The compiler cannot do this automatically, as calling __smlaxy forces the SMLAxy instruction to
144+
be used. This function takes as input an iterator of SMLAInstructions and returns an iterator of
145+
SMLAInstructions (possibly of different length).
146146
"""
147147
curr_tuple = next(instruction_tuples, None)
148148
while curr_tuple:
@@ -197,16 +197,16 @@ def _requantize_sums(num_sums, requantize_shift, output_zero_point) -> Iterator[
197197
halfwords in a word, and rearranging it would take at least one cycle. Two SSAT operations is
198198
just as good.
199199
200-
Calling __builtin_arm_ssat directly is a little bit gross, but GCC and Clang are unreliable
201-
about compiling other ways of writing this. Both the multiply + shift and shift + saturation
202-
combine to one instruction each.
200+
Calling __ssat directly is a little bit gross, but GCC and Clang are unreliable about compiling
201+
other ways of writing this. Both the multiply + shift and shift + saturation combine to one
202+
instruction each.
203203
"""
204204

205205
yield "int scale_val = *scale;"
206206
for i in range(num_sums):
207207
yield f"int requant_{i} = (sum_{i} * (long long) scale_val) >> {requantize_shift - 1};"
208208
yield f"requant_{i} = (requant_{i} + 1) >> 1;"
209-
yield f"requant_{i} = __builtin_arm_ssat(requant_{i} + {output_zero_point}, 8);"
209+
yield f"requant_{i} = __ssat(requant_{i} + {output_zero_point}, 8);"
210210

211211

212212
def _write_sums_to_memory(num_sums, offset, stride) -> Iterator[str]:

tests/python/topi/python/test_topi_conv2d_tensordot_opts.py

Lines changed: 66 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ def test_write_3x3_depthwise_code():
6060
"""
6161
#ifndef TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS
6262
#define TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS
63+
#include <arm_acle.h>
6364
__attribute__((always_inline)) static inline int tensordot_opt_x1_int16_w48_3x3_000(
6465
int *output, int *tensor, int *kernel, int *bias, int *scale
6566
) {
@@ -78,18 +79,18 @@ def test_write_3x3_depthwise_code():
7879
int kernel__y02_x00__y02_x01 = kernel[3];
7980
int kernel__y02_x02__unknown = kernel[4];
8081
81-
sum_0 = __builtin_arm_smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
82-
sum_0 = __builtin_arm_smlabb(tensor__y00_x02__unknown, kernel__y00_x02__y01_x00, sum_0);
83-
sum_0 = __builtin_arm_smlatb(kernel__y00_x02__y01_x00, tensor__y01_x00__y01_x01, sum_0);
84-
sum_0 = __builtin_arm_smlatb(tensor__y01_x00__y01_x01, kernel__y01_x01__y01_x02, sum_0);
85-
sum_0 = __builtin_arm_smlatb(kernel__y01_x01__y01_x02, tensor__y01_x02__unknown, sum_0);
86-
sum_0 = __builtin_arm_smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0);
87-
sum_0 = __builtin_arm_smlabb(tensor__y02_x02__unknown, kernel__y02_x02__unknown, sum_0);
82+
sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
83+
sum_0 = __smlabb(tensor__y00_x02__unknown, kernel__y00_x02__y01_x00, sum_0);
84+
sum_0 = __smlabt(tensor__y01_x00__y01_x01, kernel__y00_x02__y01_x00, sum_0);
85+
sum_0 = __smlatb(tensor__y01_x00__y01_x01, kernel__y01_x01__y01_x02, sum_0);
86+
sum_0 = __smlabt(tensor__y01_x02__unknown, kernel__y01_x01__y01_x02, sum_0);
87+
sum_0 = __smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0);
88+
sum_0 = __smlabb(tensor__y02_x02__unknown, kernel__y02_x02__unknown, sum_0);
8889
8990
int scale_val = *scale;
9091
int requant_0 = (sum_0 * (long long) scale_val) >> 32;
9192
requant_0 = (requant_0 + 1) >> 1;
92-
requant_0 = __builtin_arm_ssat(requant_0 + -128, 8);
93+
requant_0 = __ssat(requant_0 + -128, 8);
9394
9495
((short*) output)[0] = (short) requant_0;
9596
return 0;
@@ -112,6 +113,7 @@ def test_odd_width_3x3_depthwise_strides_code():
112113
"""
113114
#ifndef TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS
114115
#define TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS
116+
#include <arm_acle.h>
115117
__attribute__((always_inline)) static inline int tensordot_opt_x2_int16_w49_3x3_000_2_4(
116118
int *output, int *tensor, int *kernel, int *bias, int *scale
117119
) {
@@ -133,26 +135,26 @@ def test_odd_width_3x3_depthwise_strides_code():
133135
int kernel__y02_x00__y02_x01 = kernel[3];
134136
int kernel__y02_x02__unknown = kernel[4];
135137
136-
sum_0 = __builtin_arm_smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
137-
sum_0 = __builtin_arm_smlabb(tensor__y00_x02__y00_x03, kernel__y00_x02__y01_x00, sum_0);
138-
sum_0 = __builtin_arm_smlatt(tensor__unknown__y01_x00, kernel__y00_x02__y01_x00, sum_0);
139-
sum_0 = __builtin_arm_smlad(tensor__y01_x01__y01_x02, kernel__y01_x01__y01_x02, sum_0);
140-
sum_0 = __builtin_arm_smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0);
141-
sum_0 = __builtin_arm_smlabb(tensor__y02_x02__y02_x03, kernel__y02_x02__unknown, sum_0);
142-
sum_1 = __builtin_arm_smlad(tensor__y00_x02__y00_x03, kernel__y00_x00__y00_x01, sum_1);
143-
sum_1 = __builtin_arm_smlabb(tensor__y00_x04__unknown, kernel__y00_x02__y01_x00, sum_1);
144-
sum_1 = __builtin_arm_smlatt(tensor__y01_x01__y01_x02, kernel__y00_x02__y01_x00, sum_1);
145-
sum_1 = __builtin_arm_smlad(tensor__y01_x03__y01_x04, kernel__y01_x01__y01_x02, sum_1);
146-
sum_1 = __builtin_arm_smlad(tensor__y02_x02__y02_x03, kernel__y02_x00__y02_x01, sum_1);
147-
sum_1 = __builtin_arm_smlabb(tensor__y02_x04__unknown, kernel__y02_x02__unknown, sum_1);
138+
sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
139+
sum_0 = __smlabb(tensor__y00_x02__y00_x03, kernel__y00_x02__y01_x00, sum_0);
140+
sum_0 = __smlatt(tensor__unknown__y01_x00, kernel__y00_x02__y01_x00, sum_0);
141+
sum_0 = __smlad(tensor__y01_x01__y01_x02, kernel__y01_x01__y01_x02, sum_0);
142+
sum_0 = __smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0);
143+
sum_0 = __smlabb(tensor__y02_x02__y02_x03, kernel__y02_x02__unknown, sum_0);
144+
sum_1 = __smlad(tensor__y00_x02__y00_x03, kernel__y00_x00__y00_x01, sum_1);
145+
sum_1 = __smlabb(tensor__y00_x04__unknown, kernel__y00_x02__y01_x00, sum_1);
146+
sum_1 = __smlatt(tensor__y01_x01__y01_x02, kernel__y00_x02__y01_x00, sum_1);
147+
sum_1 = __smlad(tensor__y01_x03__y01_x04, kernel__y01_x01__y01_x02, sum_1);
148+
sum_1 = __smlad(tensor__y02_x02__y02_x03, kernel__y02_x00__y02_x01, sum_1);
149+
sum_1 = __smlabb(tensor__y02_x04__unknown, kernel__y02_x02__unknown, sum_1);
148150
149151
int scale_val = *scale;
150152
int requant_0 = (sum_0 * (long long) scale_val) >> 32;
151153
requant_0 = (requant_0 + 1) >> 1;
152-
requant_0 = __builtin_arm_ssat(requant_0 + -128, 8);
154+
requant_0 = __ssat(requant_0 + -128, 8);
153155
int requant_1 = (sum_1 * (long long) scale_val) >> 32;
154156
requant_1 = (requant_1 + 1) >> 1;
155-
requant_1 = __builtin_arm_ssat(requant_1 + -128, 8);
157+
requant_1 = __ssat(requant_1 + -128, 8);
156158
157159
((short*) output)[0] = (short) requant_0;
158160
((short*) output)[4] = (short) requant_1;
@@ -174,6 +176,7 @@ def test_1x1x8_convolution_code():
174176
"""
175177
#ifndef TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS
176178
#define TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS
179+
#include <arm_acle.h>
177180
__attribute__((always_inline)) static inline int tensordot_opt_x4_int16_w384_1x8_000_8_1(
178181
int *output, int *tensor, int *kernel, int *bias, int *scale
179182
) {
@@ -201,36 +204,36 @@ def test_1x1x8_convolution_code():
201204
int kernel__y00_x04__y00_x05 = kernel[2];
202205
int kernel__y00_x06__y00_x07 = kernel[3];
203206
204-
sum_0 = __builtin_arm_smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
205-
sum_0 = __builtin_arm_smlad(tensor__y00_x02__y00_x03, kernel__y00_x02__y00_x03, sum_0);
206-
sum_0 = __builtin_arm_smlad(tensor__y00_x04__y00_x05, kernel__y00_x04__y00_x05, sum_0);
207-
sum_0 = __builtin_arm_smlad(tensor__y00_x06__y00_x07, kernel__y00_x06__y00_x07, sum_0);
208-
sum_1 = __builtin_arm_smlad(tensor__y00_x08__y00_x09, kernel__y00_x00__y00_x01, sum_1);
209-
sum_1 = __builtin_arm_smlad(tensor__y00_x0a__y00_x0b, kernel__y00_x02__y00_x03, sum_1);
210-
sum_1 = __builtin_arm_smlad(tensor__y00_x0c__y00_x0d, kernel__y00_x04__y00_x05, sum_1);
211-
sum_1 = __builtin_arm_smlad(tensor__y00_x0e__y00_x0f, kernel__y00_x06__y00_x07, sum_1);
212-
sum_2 = __builtin_arm_smlad(tensor__y00_x10__y00_x11, kernel__y00_x00__y00_x01, sum_2);
213-
sum_2 = __builtin_arm_smlad(tensor__y00_x12__y00_x13, kernel__y00_x02__y00_x03, sum_2);
214-
sum_2 = __builtin_arm_smlad(tensor__y00_x14__y00_x15, kernel__y00_x04__y00_x05, sum_2);
215-
sum_2 = __builtin_arm_smlad(tensor__y00_x16__y00_x17, kernel__y00_x06__y00_x07, sum_2);
216-
sum_3 = __builtin_arm_smlad(tensor__y00_x18__y00_x19, kernel__y00_x00__y00_x01, sum_3);
217-
sum_3 = __builtin_arm_smlad(tensor__y00_x1a__y00_x1b, kernel__y00_x02__y00_x03, sum_3);
218-
sum_3 = __builtin_arm_smlad(tensor__y00_x1c__y00_x1d, kernel__y00_x04__y00_x05, sum_3);
219-
sum_3 = __builtin_arm_smlad(tensor__y00_x1e__y00_x1f, kernel__y00_x06__y00_x07, sum_3);
207+
sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0);
208+
sum_0 = __smlad(tensor__y00_x02__y00_x03, kernel__y00_x02__y00_x03, sum_0);
209+
sum_0 = __smlad(tensor__y00_x04__y00_x05, kernel__y00_x04__y00_x05, sum_0);
210+
sum_0 = __smlad(tensor__y00_x06__y00_x07, kernel__y00_x06__y00_x07, sum_0);
211+
sum_1 = __smlad(tensor__y00_x08__y00_x09, kernel__y00_x00__y00_x01, sum_1);
212+
sum_1 = __smlad(tensor__y00_x0a__y00_x0b, kernel__y00_x02__y00_x03, sum_1);
213+
sum_1 = __smlad(tensor__y00_x0c__y00_x0d, kernel__y00_x04__y00_x05, sum_1);
214+
sum_1 = __smlad(tensor__y00_x0e__y00_x0f, kernel__y00_x06__y00_x07, sum_1);
215+
sum_2 = __smlad(tensor__y00_x10__y00_x11, kernel__y00_x00__y00_x01, sum_2);
216+
sum_2 = __smlad(tensor__y00_x12__y00_x13, kernel__y00_x02__y00_x03, sum_2);
217+
sum_2 = __smlad(tensor__y00_x14__y00_x15, kernel__y00_x04__y00_x05, sum_2);
218+
sum_2 = __smlad(tensor__y00_x16__y00_x17, kernel__y00_x06__y00_x07, sum_2);
219+
sum_3 = __smlad(tensor__y00_x18__y00_x19, kernel__y00_x00__y00_x01, sum_3);
220+
sum_3 = __smlad(tensor__y00_x1a__y00_x1b, kernel__y00_x02__y00_x03, sum_3);
221+
sum_3 = __smlad(tensor__y00_x1c__y00_x1d, kernel__y00_x04__y00_x05, sum_3);
222+
sum_3 = __smlad(tensor__y00_x1e__y00_x1f, kernel__y00_x06__y00_x07, sum_3);
220223
221224
int scale_val = *scale;
222225
int requant_0 = (sum_0 * (long long) scale_val) >> 32;
223226
requant_0 = (requant_0 + 1) >> 1;
224-
requant_0 = __builtin_arm_ssat(requant_0 + -128, 8);
227+
requant_0 = __ssat(requant_0 + -128, 8);
225228
int requant_1 = (sum_1 * (long long) scale_val) >> 32;
226229
requant_1 = (requant_1 + 1) >> 1;
227-
requant_1 = __builtin_arm_ssat(requant_1 + -128, 8);
230+
requant_1 = __ssat(requant_1 + -128, 8);
228231
int requant_2 = (sum_2 * (long long) scale_val) >> 32;
229232
requant_2 = (requant_2 + 1) >> 1;
230-
requant_2 = __builtin_arm_ssat(requant_2 + -128, 8);
233+
requant_2 = __ssat(requant_2 + -128, 8);
231234
int requant_3 = (sum_3 * (long long) scale_val) >> 32;
232235
requant_3 = (requant_3 + 1) >> 1;
233-
requant_3 = __builtin_arm_ssat(requant_3 + -128, 8);
236+
requant_3 = __ssat(requant_3 + -128, 8);
234237
235238
int packed_res_0 = requant_0 + (requant_1 << 16);
236239
int packed_res_1 = requant_2 + (requant_3 << 16);
@@ -269,6 +272,7 @@ def test_3x3x3_offset_convolution_code():
269272
"""
270273
#ifndef TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS
271274
#define TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS
275+
#include <arm_acle.h>
272276
__attribute__((always_inline)) static inline int tensordot_opt_x1_int16_w288_3x9_111(
273277
int *output, int *tensor, int *kernel, int *bias, int *scale
274278
) {
@@ -305,30 +309,30 @@ def test_3x3x3_offset_convolution_code():
305309
int kernel__y02_x05__y02_x06 = kernel[12];
306310
int kernel__y02_x07__y02_x08 = kernel[13];
307311
308-
sum_0 = __builtin_arm_smlatt(tensor__unknown__y00_x00, kernel__unknown__y00_x00, sum_0);
309-
sum_0 = __builtin_arm_smlad(tensor__y00_x01__y00_x02, kernel__y00_x01__y00_x02, sum_0);
310-
sum_0 = __builtin_arm_smlad(tensor__y00_x03__y00_x04, kernel__y00_x03__y00_x04, sum_0);
311-
sum_0 = __builtin_arm_smlad(tensor__y00_x05__y00_x06, kernel__y00_x05__y00_x06, sum_0);
312-
sum_0 = __builtin_arm_smlad(tensor__y00_x07__y00_x08, kernel__y00_x07__y00_x08, sum_0);
313-
sum_0 = __builtin_arm_smlatb(tensor__unknown__y01_x00, kernel__y01_x00__y01_x01, sum_0);
314-
sum_0 = __builtin_arm_smlatb(kernel__y01_x00__y01_x01, tensor__y01_x01__y01_x02, sum_0);
315-
sum_0 = __builtin_arm_smlatb(tensor__y01_x01__y01_x02, kernel__y01_x02__y01_x03, sum_0);
316-
sum_0 = __builtin_arm_smlatb(kernel__y01_x02__y01_x03, tensor__y01_x03__y01_x04, sum_0);
317-
sum_0 = __builtin_arm_smlatb(tensor__y01_x03__y01_x04, kernel__y01_x04__y01_x05, sum_0);
318-
sum_0 = __builtin_arm_smlatb(kernel__y01_x04__y01_x05, tensor__y01_x05__y01_x06, sum_0);
319-
sum_0 = __builtin_arm_smlatb(tensor__y01_x05__y01_x06, kernel__y01_x06__y01_x07, sum_0);
320-
sum_0 = __builtin_arm_smlatb(kernel__y01_x06__y01_x07, tensor__y01_x07__y01_x08, sum_0);
321-
sum_0 = __builtin_arm_smlatb(tensor__y01_x07__y01_x08, kernel__y01_x08__y02_x00, sum_0);
322-
sum_0 = __builtin_arm_smlatt(tensor__unknown__y02_x00, kernel__y01_x08__y02_x00, sum_0);
323-
sum_0 = __builtin_arm_smlad(tensor__y02_x01__y02_x02, kernel__y02_x01__y02_x02, sum_0);
324-
sum_0 = __builtin_arm_smlad(tensor__y02_x03__y02_x04, kernel__y02_x03__y02_x04, sum_0);
325-
sum_0 = __builtin_arm_smlad(tensor__y02_x05__y02_x06, kernel__y02_x05__y02_x06, sum_0);
326-
sum_0 = __builtin_arm_smlad(tensor__y02_x07__y02_x08, kernel__y02_x07__y02_x08, sum_0);
312+
sum_0 = __smlatt(tensor__unknown__y00_x00, kernel__unknown__y00_x00, sum_0);
313+
sum_0 = __smlad(tensor__y00_x01__y00_x02, kernel__y00_x01__y00_x02, sum_0);
314+
sum_0 = __smlad(tensor__y00_x03__y00_x04, kernel__y00_x03__y00_x04, sum_0);
315+
sum_0 = __smlad(tensor__y00_x05__y00_x06, kernel__y00_x05__y00_x06, sum_0);
316+
sum_0 = __smlad(tensor__y00_x07__y00_x08, kernel__y00_x07__y00_x08, sum_0);
317+
sum_0 = __smlatb(tensor__unknown__y01_x00, kernel__y01_x00__y01_x01, sum_0);
318+
sum_0 = __smlabt(tensor__y01_x01__y01_x02, kernel__y01_x00__y01_x01, sum_0);
319+
sum_0 = __smlatb(tensor__y01_x01__y01_x02, kernel__y01_x02__y01_x03, sum_0);
320+
sum_0 = __smlabt(tensor__y01_x03__y01_x04, kernel__y01_x02__y01_x03, sum_0);
321+
sum_0 = __smlatb(tensor__y01_x03__y01_x04, kernel__y01_x04__y01_x05, sum_0);
322+
sum_0 = __smlabt(tensor__y01_x05__y01_x06, kernel__y01_x04__y01_x05, sum_0);
323+
sum_0 = __smlatb(tensor__y01_x05__y01_x06, kernel__y01_x06__y01_x07, sum_0);
324+
sum_0 = __smlabt(tensor__y01_x07__y01_x08, kernel__y01_x06__y01_x07, sum_0);
325+
sum_0 = __smlatb(tensor__y01_x07__y01_x08, kernel__y01_x08__y02_x00, sum_0);
326+
sum_0 = __smlatt(tensor__unknown__y02_x00, kernel__y01_x08__y02_x00, sum_0);
327+
sum_0 = __smlad(tensor__y02_x01__y02_x02, kernel__y02_x01__y02_x02, sum_0);
328+
sum_0 = __smlad(tensor__y02_x03__y02_x04, kernel__y02_x03__y02_x04, sum_0);
329+
sum_0 = __smlad(tensor__y02_x05__y02_x06, kernel__y02_x05__y02_x06, sum_0);
330+
sum_0 = __smlad(tensor__y02_x07__y02_x08, kernel__y02_x07__y02_x08, sum_0);
327331
328332
int scale_val = *scale;
329333
int requant_0 = (sum_0 * (long long) scale_val) >> 39;
330334
requant_0 = (requant_0 + 1) >> 1;
331-
requant_0 = __builtin_arm_ssat(requant_0 + 4, 8);
335+
requant_0 = __ssat(requant_0 + 4, 8);
332336
333337
((short*) output)[1] = (short) requant_0;
334338
return 0;

0 commit comments

Comments
 (0)