Skip to content

Commit 85cfa55

Browse files
committed
Re-use common_includes to propagate shared functions
The packing definitions (`__pkhbt` and `__pkhtb`) aren't implemented as ACLE intrinsics, nor is there a simple way to convince a C compiler to generate them.
1 parent 4d330af commit 85cfa55

File tree

2 files changed

+26
-8
lines changed

2 files changed

+26
-8
lines changed

python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@
2929
#include <tvm/runtime/crt/error_codes.h>
3030
3131
32-
#ifndef ARM_CPU_ROR_EXISTS
33-
#define ARM_CPU_ROR_EXISTS
32+
#ifndef ARM_CPU_INTRINSICS_EXIST
33+
#define ARM_CPU_INTRINSICS_EXIST
3434
__attribute__((always_inline)) uint32_t __ror(uint32_t op1, uint32_t op2)
3535
{
3636
op2 %= 32U;
@@ -40,6 +40,25 @@
4040
}
4141
return (op1 >> op2) | (op1 << (32U - op2));
4242
}
43+
44+
#define __pkhbt(ARG1,ARG2,ARG3) \
45+
__extension__ \
46+
({ \
47+
uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
48+
__ASM ("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \
49+
__RES; \
50+
})
51+
52+
#define __pkhtb(ARG1,ARG2,ARG3) \
53+
__extension__ \
54+
({ \
55+
uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
56+
if (ARG3 == 0) \
57+
__ASM ("pkhtb %0, %1, %2" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2) ); \
58+
else \
59+
__ASM ("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \
60+
__RES; \
61+
})
4362
#endif
4463
"""
4564

python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import textwrap
2424

2525
from tvm import te, tir
26-
from .common import num_simd_lanes_per_word
26+
from .common import num_simd_lanes_per_word, common_includes
2727

2828

2929
def _get_func_name(in_dtype, tensor_w, channels, kernel_h, kernel_w, suffix):
@@ -107,10 +107,8 @@ def multi_channel_convolve_impl(in_dtype, *args) -> str:
107107
def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
108108
return textwrap.dedent(
109109
(
110-
f"""
111-
#include <stdint.h>
112-
#include <arm_acle.h>
113-
110+
common_includes
111+
+ f"""
114112
// __SXTB16(_ROR(X, Y)) is combined into one assembly instruction
115113
116114
#define TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP( \
@@ -172,7 +170,8 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
172170
def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
173171
return textwrap.dedent(
174172
(
175-
f"""
173+
common_includes
174+
+ f"""
176175
#include <stdint.h>
177176
178177
/* We do four channels at once to get this speed boost. */

0 commit comments

Comments
 (0)