Skip to content

Commit 4d330af

Browse files
committed
Use builtins for intrinsics missing in older GCC
1 parent 05f2709 commit 4d330af

File tree

1 file changed

+6
-6
lines changed

1 file changed

+6
-6
lines changed

python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -122,13 +122,13 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
122 122   \
123 123   uint32_t tensor_c20 = __sxtb16(tensor_c3210); \
124 124   uint32_t kernel_c20 = __sxtb16(kernel_c3210); \
125     - sum_c0 = __smlabb(tensor_c20, kernel_c20, sum_c0); \
126     - sum_c2 = __smlatt(tensor_c20, kernel_c20, sum_c2); \
    125 + sum_c0 = __builtin_arm_smlabb(tensor_c20, kernel_c20, sum_c0); \
    126 + sum_c2 = __builtin_arm_smlatt(tensor_c20, kernel_c20, sum_c2); \
127 127   \
128 128   uint32_t tensor_c31 = __sxtb16(__ror(tensor_c3210, 8)); \
129 129   uint32_t kernel_c31 = __sxtb16(__ror(kernel_c3210, 8)); \
130     - sum_c1 = __smlabb(tensor_c31, kernel_c31, sum_c1); \
131     - sum_c3 = __smlatt(tensor_c31, kernel_c31, sum_c3); \
    130 + sum_c1 = __builtin_arm_smlabb(tensor_c31, kernel_c31, sum_c1); \
    131 + sum_c3 = __builtin_arm_smlatt(tensor_c31, kernel_c31, sum_c3); \
132 132   }}
133 133
134 134   /* We do four channels at once to get this speed boost. */
@@ -194,8 +194,8 @@ def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, k
194 194   uint32_t tensor_c10 = *(tensor + j * {channels // 2}
195 195   + i * {tensor_w * (channels // 2)});
196 196   uint32_t kernel_c10 = *kernel++;
197     - sum_c0 = __smlabb(tensor_c10, kernel_c10, sum_c0);
198     - sum_c1 = __smlatt(tensor_c10, kernel_c10, sum_c1);
    197 + sum_c0 = __builtin_arm_smlabb(tensor_c10, kernel_c10, sum_c0);
    198 + sum_c1 = __builtin_arm_smlatt(tensor_c10, kernel_c10, sum_c1);
199 199   }}
200 200   }}
201 201

0 commit comments

Comments
 (0)