@@ -116,15 +116,15 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
     tensor_c3210, \
     sum_c0, sum_c1, sum_c2, sum_c3) {{ \
   \
-  uint32_t kernel_c3210 = *arranged_kernel++; \
+  int32_t kernel_c3210 = *arranged_kernel++; \
   \
-  uint32_t tensor_c20 = __sxtb16(tensor_c3210); \
-  uint32_t kernel_c20 = __sxtb16(kernel_c3210); \
+  int32_t tensor_c20 = __sxtb16(tensor_c3210); \
+  int32_t kernel_c20 = __sxtb16(kernel_c3210); \
   sum_c0 = __builtin_arm_smlabb(tensor_c20, kernel_c20, sum_c0); \
   sum_c2 = __builtin_arm_smlatt(tensor_c20, kernel_c20, sum_c2); \
   \
-  uint32_t tensor_c31 = __sxtb16(__ror(tensor_c3210, 8)); \
-  uint32_t kernel_c31 = __sxtb16(__ror(kernel_c3210, 8)); \
+  int32_t tensor_c31 = __sxtb16(__ror(tensor_c3210, 8)); \
+  int32_t kernel_c31 = __sxtb16(__ror(kernel_c3210, 8)); \
   sum_c1 = __builtin_arm_smlabb(tensor_c31, kernel_c31, sum_c1); \
   sum_c3 = __builtin_arm_smlatt(tensor_c31, kernel_c31, sum_c3); \
 }}
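
The macro above packs four int8 channels into each 32-bit word: `__sxtb16` sign-extends bytes 0 and 2 into two 16-bit lanes, `__ror(x, 8)` rotates bytes 1 and 3 into those positions for a second `__sxtb16`, and `smlabb`/`smlatt` multiply-accumulate the bottom/top lanes. A minimal scalar model of what one invocation accumulates (illustration only, not part of the patch; helper names are hypothetical):

    #include <stdint.h>

    /* Model of __sxtb16(x): sign-extend bytes 0 and 2 into 16-bit lanes. */
    static int32_t sxtb16_model(uint32_t x) {
        int16_t lo = (int8_t)x;          /* byte 0 -> bottom halfword */
        int16_t hi = (int8_t)(x >> 16);  /* byte 2 -> top halfword    */
        return (int32_t)(((uint32_t)(uint16_t)hi << 16) | (uint16_t)lo);
    }

    /* Net effect of one macro invocation: one int8 product per channel. */
    static void quad_step_model(uint32_t tensor_c3210, uint32_t kernel_c3210,
                                int32_t sum[4]) {
        for (int c = 0; c < 4; c++) {
            int32_t t = (int8_t)(tensor_c3210 >> (8 * c)); /* channel c of tensor */
            int32_t k = (int8_t)(kernel_c3210 >> (8 * c)); /* channel c of kernel */
            sum[c] += t * k; /* what __sxtb16 + __ror + smlabb/smlatt compute */
        }
    }
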
@@ -134,22 +134,30 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
 extern "C"
 #endif
 int32_t {_get_func_name("int8", tensor_w, channels, kernel_h, kernel_w, suffix)}(
-    uint32_t *out,
-    uint32_t *tensor,
-    uint32_t *kernel) {{
+    int32_t *out,
+    int8_t *tensor,
+    int8_t *kernel) {{
 
-  uint32_t sum_c0 = 0;
-  uint32_t sum_c1 = 0;
-  uint32_t sum_c2 = 0;
-  uint32_t sum_c3 = 0;
+  int32_t sum_c0 = 0;
+  int32_t sum_c1 = 0;
+  int32_t sum_c2 = 0;
+  int32_t sum_c3 = 0;
+
+  int32_t kernel_i32[{kernel_h} * {kernel_w}];
+  memcpy(kernel_i32, kernel, {kernel_h} * {kernel_w} * sizeof(int32_t));
+  int32_t *arranged_kernel = kernel_i32;
+
+  int32_t tensor_length = {((kernel_w - 1) * (channels // 4) + (kernel_h - 1) * tensor_w * (channels // 4)) + 1};
+  int32_t tensor_i32[tensor_length];
+  memcpy(tensor_i32, tensor, tensor_length * sizeof(int32_t));
 
   #pragma GCC unroll 3
   for (int i = 0; i < {kernel_h}; i++) {{
     #pragma GCC unroll 3
     for (int j = 0; j < {kernel_w}; j++) {{
       TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP(
-        kernel,
-        *(tensor + j * {channels // 4} + i * {tensor_w * (channels // 4)}),
+        arranged_kernel,
+        *(tensor_i32 + j * {channels // 4} + i * {tensor_w * (channels // 4)}),
         sum_c0, sum_c1, sum_c2, sum_c3)
     }}
   }}
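
Context for the change above: the kernel previously read the packed int8 data through `uint32_t *` parameters, which risks undefined behavior under C's strict-aliasing rules. The patch takes properly typed `int8_t *` parameters and `memcpy`s the bytes into local `int32_t` buffers instead, the portable way to reinterpret raw bytes; a fixed-size `memcpy` is lowered by the compiler to plain word loads. The same pattern in isolation (hypothetical helper name `load_i32`, for illustration only):

    #include <stdint.h>
    #include <string.h>

    /* Aliasing-safe reinterpretation: copy the raw bytes into an int32_t
     * object instead of dereferencing a cast pointer. */
    static inline int32_t load_i32(const int8_t *p) {
        int32_t word;
        memcpy(&word, p, sizeof(word)); /* typically folds to a single load */
        return word;
    }
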
@@ -179,20 +187,26 @@ def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, k
 extern "C"
 #endif
 int32_t {_get_func_name("int16", tensor_w, channels, kernel_h, kernel_w, suffix)}(
-    uint32_t *out,
-    uint32_t *tensor,
-    uint32_t *kernel) {{
+    int32_t *out,
+    int16_t *tensor,
+    int16_t *kernel) {{
+
+  int32_t sum_c0 = 0;
+  int32_t sum_c1 = 0;
+
+  int32_t kernel_i32[{kernel_h} * {kernel_w}];
+  memcpy(kernel_i32, kernel, {kernel_h} * {kernel_w} * sizeof(int32_t));
 
-  uint32_t sum_c0 = 0;
-  uint32_t sum_c1 = 0;
+  int32_t tensor_length = {((kernel_w - 1) * (channels // 2) + (kernel_h - 1) * tensor_w * (channels // 2)) + 1};
+  int32_t tensor_i32[tensor_length];
+  memcpy(tensor_i32, tensor, tensor_length * sizeof(int32_t));
 
   #pragma GCC unroll 3
   for (int i = 0; i < {kernel_h}; i++) {{
     #pragma GCC unroll 3
     for (int j = 0; j < {kernel_w}; j++) {{
-      uint32_t tensor_c10 = *(tensor + j * {channels // 2}
-                              + i * {tensor_w * (channels // 2)});
-      uint32_t kernel_c10 = *kernel++;
+      int32_t tensor_c10 = tensor_i32[j * {channels // 2} + i * {tensor_w * (channels // 2)}];
+      int32_t kernel_c10 = kernel_i32[{kernel_w} * i + j];
       sum_c0 = __builtin_arm_smlabb(tensor_c10, kernel_c10, sum_c0);
       sum_c1 = __builtin_arm_smlatt(tensor_c10, kernel_c10, sum_c1);
     }}
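
In the int16 variant each 32-bit word holds only two channels, so no byte rearrangement is needed: `smlabb` multiply-accumulates the bottom halfwords (channel 0) and `smlatt` the top halfwords (channel 1). A scalar model of one inner-loop step (illustration only; the helper name is hypothetical):

    #include <stdint.h>

    /* One inner-loop step of the dual int16 kernel: one MAC per channel. */
    static void dual_step_model(int32_t tensor_c10, int32_t kernel_c10,
                                int32_t *sum_c0, int32_t *sum_c1) {
        *sum_c0 += (int16_t)tensor_c10 * (int16_t)kernel_c10;                 /* smlabb */
        *sum_c1 += (int16_t)(tensor_c10 >> 16) * (int16_t)(kernel_c10 >> 16); /* smlatt */
    }
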