Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -116,15 +116,15 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
tensor_c3210, \
sum_c0, sum_c1, sum_c2, sum_c3) {{ \
\
uint32_t kernel_c3210 = *arranged_kernel++; \
int32_t kernel_c3210 = *arranged_kernel++; \
\
uint32_t tensor_c20 = __sxtb16(tensor_c3210); \
uint32_t kernel_c20 = __sxtb16(kernel_c3210); \
int32_t tensor_c20 = __sxtb16(tensor_c3210); \
int32_t kernel_c20 = __sxtb16(kernel_c3210); \
sum_c0 = __builtin_arm_smlabb(tensor_c20, kernel_c20, sum_c0); \
sum_c2 = __builtin_arm_smlatt(tensor_c20, kernel_c20, sum_c2); \
\
uint32_t tensor_c31 = __sxtb16(__ror(tensor_c3210, 8)); \
uint32_t kernel_c31 = __sxtb16(__ror(kernel_c3210, 8)); \
int32_t tensor_c31 = __sxtb16(__ror(tensor_c3210, 8)); \
int32_t kernel_c31 = __sxtb16(__ror(kernel_c3210, 8)); \
sum_c1 = __builtin_arm_smlabb(tensor_c31, kernel_c31, sum_c1); \
sum_c3 = __builtin_arm_smlatt(tensor_c31, kernel_c31, sum_c3); \
}}
Expand All @@ -134,22 +134,30 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
extern "C"
#endif
int32_t {_get_func_name("int8", tensor_w, channels, kernel_h, kernel_w, suffix)}(
uint32_t *out,
uint32_t *tensor,
uint32_t *kernel) {{
int32_t *out,
int8_t *tensor,
int8_t *kernel) {{

uint32_t sum_c0 = 0;
uint32_t sum_c1 = 0;
uint32_t sum_c2 = 0;
uint32_t sum_c3 = 0;
int32_t sum_c0 = 0;
int32_t sum_c1 = 0;
int32_t sum_c2 = 0;
int32_t sum_c3 = 0;

int32_t kernel_i32[{kernel_h} * {kernel_w}];
memcpy(kernel_i32, kernel, {kernel_h} * {kernel_w} * sizeof(int32_t));
int32_t *arranged_kernel = kernel_i32;

int32_t tensor_length = {((kernel_w - 1) * (channels // 4) + (kernel_h - 1) * tensor_w * (channels // 4)) + 1};
int32_t tensor_i32[tensor_length];
memcpy(tensor_i32, tensor, tensor_length * sizeof(int32_t));

#pragma GCC unroll 3
for (int i = 0; i < {kernel_h}; i++) {{
#pragma GCC unroll 3
for (int j = 0; j < {kernel_w}; j++) {{
TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP(
kernel,
*(tensor + j * {channels // 4} + i * {tensor_w * (channels // 4)}),
arranged_kernel,
*(tensor_i32 + j * {channels // 4} + i * {tensor_w * (channels // 4)}),
sum_c0, sum_c1, sum_c2, sum_c3)
}}
}}
Expand Down Expand Up @@ -179,20 +187,26 @@ def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, k
extern "C"
#endif
int32_t {_get_func_name("int16", tensor_w, channels, kernel_h, kernel_w, suffix)}(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we also make these changes for the int8 version of this function in the lines above?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

uint32_t *out,
uint32_t *tensor,
uint32_t *kernel) {{
int32_t *out,
int16_t *tensor,
int16_t *kernel) {{

int32_t sum_c0 = 0;
int32_t sum_c1 = 0;

int32_t kernel_i32[{kernel_h} * {kernel_w}];
memcpy(kernel_i32, kernel, {kernel_h} * {kernel_w} * sizeof(int32_t));

uint32_t sum_c0 = 0;
uint32_t sum_c1 = 0;
int32_t tensor_length = {((kernel_w - 1) * (channels // 2) + (kernel_h - 1) * tensor_w * (channels // 2)) + 1};
int32_t tensor_i32[tensor_length];
memcpy(tensor_i32, tensor, tensor_length * sizeof(int32_t));

#pragma GCC unroll 3
for (int i = 0; i < {kernel_h}; i++) {{
#pragma GCC unroll 3
for (int j = 0; j < {kernel_w}; j++) {{
uint32_t tensor_c10 = *(tensor + j * {channels // 2}
+ i * {tensor_w * (channels // 2)});
uint32_t kernel_c10 = *kernel++;
int32_t tensor_c10 = tensor_i32[j * {channels // 2} + i * {tensor_w * (channels // 2)}];
int32_t kernel_c10 = kernel_i32[{kernel_w} * i + j];
sum_c0 = __builtin_arm_smlabb(tensor_c10, kernel_c10, sum_c0);
sum_c1 = __builtin_arm_smlatt(tensor_c10, kernel_c10, sum_c1);
}}
Expand Down
8 changes: 5 additions & 3 deletions tests/python/relay/aot/corstone300.mk
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ DRIVER_PATH=${ETHOSU_PATH}/core_driver
CMSIS_PATH=${ETHOSU_PATH}/cmsis
ETHOSU_PLATFORM_PATH=/opt/arm/ethosu/core_platform
CORSTONE_300_PATH = ${ETHOSU_PLATFORM_PATH}/targets/corstone-300
PKG_COMPILE_OPTS = -g -Wall -O2 -Wno-incompatible-pointer-types -Wno-format -Werror-implicit-function-declaration -mcpu=${MCPU}${MCPU_FLAGS} -mthumb -mfloat-abi=${MFLOAT_ABI} -std=gnu99
PKG_COMPILE_OPTS = -Wall -Ofast -Wno-incompatible-pointer-types -Wno-format -Werror-implicit-function-declaration -mcpu=${MCPU}${MCPU_FLAGS} -mthumb -mfloat-abi=${MFLOAT_ABI} -std=gnu99
CMAKE = /opt/arm/cmake/bin/cmake
CC = arm-none-eabi-gcc
AR = arm-none-eabi-ar
Expand All @@ -64,7 +64,8 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \
CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=${TVM_ROOT}/tests/python/contrib/test_ethosu/reference_system/arm-none-eabi-gcc.cmake \
-DCMAKE_SYSTEM_PROCESSOR=${MCPU}

PKG_LDFLAGS = -lm -specs=nosys.specs -static -T ${AOT_TEST_ROOT}/corstone300.ld
# -fdata-sections together with --gc-sections may lead to smaller statically-linked executables
PKG_LDFLAGS = -lm -specs=nosys.specs -static -Wl,--gc-sections -T ${AOT_TEST_ROOT}/corstone300.ld

$(ifeq VERBOSE,1)
QUIET ?=
Expand Down Expand Up @@ -113,9 +114,10 @@ ${build_dir}/libcmsis_startup.a: $(CMSIS_STARTUP_SRCS)
$(QUIET)$(AR) -cr $(abspath $(build_dir)/libcmsis_startup.a) $(abspath $(build_dir))/libcmsis_startup/*.o
$(QUIET)$(RANLIB) $(abspath $(build_dir)/libcmsis_startup.a)

# -ffunction-sections and -fdata-sections together with --gc-sections may lead to smaller statically-linked executables
${build_dir}/libcmsis_nn.a: $(CMSIS_NN_SRCS)
$(QUIET)mkdir -p $(abspath $(build_dir)/libcmsis_nn)
$(QUIET)cd $(abspath $(build_dir)/libcmsis_nn) && $(CC) -c $(PKG_CFLAGS) -D${ARM_CPU} $^
$(QUIET)cd $(abspath $(build_dir)/libcmsis_nn) && $(CC) -c $(PKG_CFLAGS) -ffunction-sections -fdata-sections -D${ARM_CPU} $^
$(QUIET)$(AR) -cr $(abspath $(build_dir)/libcmsis_nn.a) $(abspath $(build_dir))/libcmsis_nn/*.o
$(QUIET)$(RANLIB) $(abspath $(build_dir)/libcmsis_nn.a)

Expand Down