@@ -122,13 +122,13 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
 \
 uint32_t tensor_c20 = __sxtb16(tensor_c3210); \
 uint32_t kernel_c20 = __sxtb16(kernel_c3210); \
-sum_c0 = __smlabb (tensor_c20, kernel_c20, sum_c0); \
-sum_c2 = __smlatt (tensor_c20, kernel_c20, sum_c2); \
+sum_c0 = __builtin_arm_smlabb (tensor_c20, kernel_c20, sum_c0); \
+sum_c2 = __builtin_arm_smlatt (tensor_c20, kernel_c20, sum_c2); \
 \
 uint32_t tensor_c31 = __sxtb16(__ror(tensor_c3210, 8)); \
 uint32_t kernel_c31 = __sxtb16(__ror(kernel_c3210, 8)); \
-sum_c1 = __smlabb (tensor_c31, kernel_c31, sum_c1); \
-sum_c3 = __smlatt (tensor_c31, kernel_c31, sum_c3); \
+sum_c1 = __builtin_arm_smlabb (tensor_c31, kernel_c31, sum_c1); \
+sum_c3 = __builtin_arm_smlatt (tensor_c31, kernel_c31, sum_c3); \
 }}

 /* We do four channels at once to get this speed boost. */
@@ -194,8 +194,8 @@ def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, k
 uint32_t tensor_c10 = *(tensor + j * { channels // 2 }
 + i * { tensor_w * (channels // 2 )} );
 uint32_t kernel_c10 = *kernel++;
-sum_c0 = __smlabb (tensor_c10, kernel_c10, sum_c0);
-sum_c1 = __smlatt (tensor_c10, kernel_c10, sum_c1);
+sum_c0 = __builtin_arm_smlabb (tensor_c10, kernel_c10, sum_c0);
+sum_c1 = __builtin_arm_smlatt (tensor_c10, kernel_c10, sum_c1);
 }}
 }}

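Note on the change above: the diff swaps the ACLE-style __smlabb/__smlatt intrinsic names for the __builtin_arm_smlabb/__builtin_arm_smlatt compiler builtins; both forms perform a signed multiply of selected 16-bit halfwords with a 32-bit accumulate. As a rough reference only, ignoring the Q saturation flag, the arithmetic can be sketched in portable C (ref_smlabb/ref_smlatt are hypothetical names, not part of the commit):

#include <stdint.h>

/* Sketch of SMLABB: multiply the signed bottom halfwords of a and b,
   then add the product to the accumulator. */
static inline int32_t ref_smlabb(uint32_t a, uint32_t b, int32_t acc) {
    return acc + (int32_t)(int16_t)(a & 0xFFFF) * (int32_t)(int16_t)(b & 0xFFFF);
}

/* Sketch of SMLATT: the same accumulation, but using the signed top halfwords. */
static inline int32_t ref_smlatt(uint32_t a, uint32_t b, int32_t acc) {
    return acc + (int32_t)(int16_t)(a >> 16) * (int32_t)(int16_t)(b >> 16);
}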