Skip to content

Commit 7ede828

Browse files
NicolaLancellotti, Ashutosh Parkhi
authored and committed
[CMSIS-NN] Reduction in code size of AOT test runner binary (apache#13815)
* [CMSIS-NN] Reduction in code size of AOT test runner binary Co-authored-by: Ashutosh Parkhi <[email protected]>
1 parent 55e6d25 commit 7ede828

File tree

2 files changed

+41
-25
lines changed

2 files changed

+41
-25
lines changed

python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py

Lines changed: 36 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -116,15 +116,15 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
116116
tensor_c3210, \
117117
sum_c0, sum_c1, sum_c2, sum_c3) {{ \
118118
\
119-
uint32_t kernel_c3210 = *arranged_kernel++; \
119+
int32_t kernel_c3210 = *arranged_kernel++; \
120120
\
121-
uint32_t tensor_c20 = __sxtb16(tensor_c3210); \
122-
uint32_t kernel_c20 = __sxtb16(kernel_c3210); \
121+
int32_t tensor_c20 = __sxtb16(tensor_c3210); \
122+
int32_t kernel_c20 = __sxtb16(kernel_c3210); \
123123
sum_c0 = __builtin_arm_smlabb(tensor_c20, kernel_c20, sum_c0); \
124124
sum_c2 = __builtin_arm_smlatt(tensor_c20, kernel_c20, sum_c2); \
125125
\
126-
uint32_t tensor_c31 = __sxtb16(__ror(tensor_c3210, 8)); \
127-
uint32_t kernel_c31 = __sxtb16(__ror(kernel_c3210, 8)); \
126+
int32_t tensor_c31 = __sxtb16(__ror(tensor_c3210, 8)); \
127+
int32_t kernel_c31 = __sxtb16(__ror(kernel_c3210, 8)); \
128128
sum_c1 = __builtin_arm_smlabb(tensor_c31, kernel_c31, sum_c1); \
129129
sum_c3 = __builtin_arm_smlatt(tensor_c31, kernel_c31, sum_c3); \
130130
}}
@@ -134,22 +134,30 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
134134
extern "C"
135135
#endif
136136
int32_t {_get_func_name("int8", tensor_w, channels, kernel_h, kernel_w, suffix)}(
137-
uint32_t *out,
138-
uint32_t *tensor,
139-
uint32_t *kernel) {{
137+
int32_t *out,
138+
int8_t *tensor,
139+
int8_t *kernel) {{
140140
141-
uint32_t sum_c0 = 0;
142-
uint32_t sum_c1 = 0;
143-
uint32_t sum_c2 = 0;
144-
uint32_t sum_c3 = 0;
141+
int32_t sum_c0 = 0;
142+
int32_t sum_c1 = 0;
143+
int32_t sum_c2 = 0;
144+
int32_t sum_c3 = 0;
145+
146+
int32_t kernel_i32[{kernel_h} * {kernel_w}];
147+
memcpy(kernel_i32, kernel, {kernel_h} * {kernel_w} * sizeof(int32_t));
148+
int32_t *arranged_kernel = kernel_i32;
149+
150+
int32_t tensor_length = {((kernel_w - 1) * (channels // 4) + (kernel_h - 1) * tensor_w * (channels // 4)) + 1};
151+
int32_t tensor_i32[tensor_length];
152+
memcpy(tensor_i32, tensor, tensor_length * sizeof(int32_t));
145153
146154
#pragma GCC unroll 3
147155
for (int i = 0; i < {kernel_h}; i++) {{
148156
#pragma GCC unroll 3
149157
for (int j = 0; j < {kernel_w}; j++) {{
150158
TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP(
151-
kernel,
152-
*(tensor + j * {channels // 4} + i * {tensor_w * (channels // 4)}),
159+
arranged_kernel,
160+
*(tensor_i32 + j * {channels // 4} + i * {tensor_w * (channels // 4)}),
153161
sum_c0, sum_c1, sum_c2, sum_c3)
154162
}}
155163
}}
@@ -179,20 +187,26 @@ def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, k
179187
extern "C"
180188
#endif
181189
int32_t {_get_func_name("int16", tensor_w, channels, kernel_h, kernel_w, suffix)}(
182-
uint32_t *out,
183-
uint32_t *tensor,
184-
uint32_t *kernel) {{
190+
int32_t *out,
191+
int16_t *tensor,
192+
int16_t *kernel) {{
193+
194+
int32_t sum_c0 = 0;
195+
int32_t sum_c1 = 0;
196+
197+
int32_t kernel_i32[{kernel_h} * {kernel_w}];
198+
memcpy(kernel_i32, kernel, {kernel_h} * {kernel_w} * sizeof(int32_t));
185199
186-
uint32_t sum_c0 = 0;
187-
uint32_t sum_c1 = 0;
200+
int32_t tensor_length = {((kernel_w - 1) * (channels // 2) + (kernel_h - 1) * tensor_w * (channels // 2)) + 1};
201+
int32_t tensor_i32[tensor_length];
202+
memcpy(tensor_i32, tensor, tensor_length * sizeof(int32_t));
188203
189204
#pragma GCC unroll 3
190205
for (int i = 0; i < {kernel_h}; i++) {{
191206
#pragma GCC unroll 3
192207
for (int j = 0; j < {kernel_w}; j++) {{
193-
uint32_t tensor_c10 = *(tensor + j * {channels // 2}
194-
+ i * {tensor_w * (channels // 2)});
195-
uint32_t kernel_c10 = *kernel++;
208+
int32_t tensor_c10 = tensor_i32[j * {channels // 2} + i * {tensor_w * (channels // 2)}];
209+
int32_t kernel_c10 = kernel_i32[{kernel_w} * i + j];
196210
sum_c0 = __builtin_arm_smlabb(tensor_c10, kernel_c10, sum_c0);
197211
sum_c1 = __builtin_arm_smlatt(tensor_c10, kernel_c10, sum_c1);
198212
}}

tests/python/relay/aot/corstone300.mk

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ DRIVER_PATH=${ETHOSU_PATH}/core_driver
4343
CMSIS_PATH=${ETHOSU_PATH}/cmsis
4444
ETHOSU_PLATFORM_PATH=/opt/arm/ethosu/core_platform
4545
CORSTONE_300_PATH = ${ETHOSU_PLATFORM_PATH}/targets/corstone-300
46-
PKG_COMPILE_OPTS = -g -Wall -O2 -Wno-incompatible-pointer-types -Wno-format -Werror-implicit-function-declaration -mcpu=${MCPU}${MCPU_FLAGS} -mthumb -mfloat-abi=${MFLOAT_ABI} -std=gnu99
46+
PKG_COMPILE_OPTS = -Wall -Ofast -Wno-incompatible-pointer-types -Wno-format -Werror-implicit-function-declaration -mcpu=${MCPU}${MCPU_FLAGS} -mthumb -mfloat-abi=${MFLOAT_ABI} -std=gnu99
4747
CMAKE = /opt/arm/cmake/bin/cmake
4848
CC = arm-none-eabi-gcc
4949
AR = arm-none-eabi-ar
@@ -64,7 +64,8 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \
6464
CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=${TVM_ROOT}/tests/python/contrib/test_ethosu/reference_system/arm-none-eabi-gcc.cmake \
6565
-DCMAKE_SYSTEM_PROCESSOR=${MCPU}
6666

67-
PKG_LDFLAGS = -lm -specs=nosys.specs -static -T ${AOT_TEST_ROOT}/corstone300.ld
67+
# -ffunction-sections and -fdata-sections together with --gc-sections may lead to smaller statically-linked executables
68+
PKG_LDFLAGS = -lm -specs=nosys.specs -static -Wl,--gc-sections -T ${AOT_TEST_ROOT}/corstone300.ld
6869

6970
$(ifeq VERBOSE,1)
7071
QUIET ?=
@@ -113,9 +114,10 @@ ${build_dir}/libcmsis_startup.a: $(CMSIS_STARTUP_SRCS)
113114
$(QUIET)$(AR) -cr $(abspath $(build_dir)/libcmsis_startup.a) $(abspath $(build_dir))/libcmsis_startup/*.o
114115
$(QUIET)$(RANLIB) $(abspath $(build_dir)/libcmsis_startup.a)
115116

117+
# -ffunction-sections and -fdata-sections together with --gc-sections may lead to smaller statically-linked executables
116118
${build_dir}/libcmsis_nn.a: $(CMSIS_NN_SRCS)
117119
$(QUIET)mkdir -p $(abspath $(build_dir)/libcmsis_nn)
118-
$(QUIET)cd $(abspath $(build_dir)/libcmsis_nn) && $(CC) -c $(PKG_CFLAGS) -D${ARM_CPU} $^
120+
$(QUIET)cd $(abspath $(build_dir)/libcmsis_nn) && $(CC) -c $(PKG_CFLAGS) -ffunction-sections -fdata-sections -D${ARM_CPU} $^
119121
$(QUIET)$(AR) -cr $(abspath $(build_dir)/libcmsis_nn.a) $(abspath $(build_dir))/libcmsis_nn/*.o
120122
$(QUIET)$(RANLIB) $(abspath $(build_dir)/libcmsis_nn.a)
121123

0 commit comments

Comments
 (0)