Re-use common_includes to propagate shared functions

Mousius · Mousius · commit 85cfa552dbbc · 2022-12-28T11:21:40.000Z
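The packing definitions (`__pkhbt` and `__pkhtb`) aren't implemented as ACLE intrinsics, nor is there a simple way to convince a C compiler to generate them, so this change defines them as inline-assembly macros next to `__ror` under the renamed `ARM_CPU_INTRINSICS_EXIST` guard.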
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py
@@ -29,8 +29,8 @@
 #include <tvm/runtime/crt/error_codes.h>
 
 
-#ifndef ARM_CPU_ROR_EXISTS
-#define ARM_CPU_ROR_EXISTS
+#ifndef ARM_CPU_INTRINSICS_EXIST
+#define ARM_CPU_INTRINSICS_EXIST
 __attribute__((always_inline)) uint32_t __ror(uint32_t op1, uint32_t op2)
 {
   op2 %= 32U;
@@ -40,6 +40,25 @@
   }
   return (op1 >> op2) | (op1 << (32U - op2));
 }
+
+#define __pkhbt(ARG1,ARG2,ARG3) \
+__extension__ \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
+  __ASM ("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2), "I" (ARG3)  ); \
+  __RES; \
+ })
+
+#define __pkhtb(ARG1,ARG2,ARG3) \
+__extension__ \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
+  if (ARG3 == 0) \
+    __ASM ("pkhtb %0, %1, %2" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2)  ); \
+  else \
+    __ASM ("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2), "I" (ARG3)  ); \
+  __RES; \
+ })
 #endif
 """
 
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py
@@ -23,7 +23,7 @@
 import textwrap
 
 from tvm import te, tir
-from .common import num_simd_lanes_per_word
+from .common import num_simd_lanes_per_word, common_includes
 
 
 def _get_func_name(in_dtype, tensor_w, channels, kernel_h, kernel_w, suffix):
@@ -107,10 +107,8 @@ def multi_channel_convolve_impl(in_dtype, *args) -> str:
 def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
     return textwrap.dedent(
         (
-            f"""
-        #include <stdint.h>
-        #include <arm_acle.h>
-
+            common_includes
+            + f"""
         // __SXTB16(_ROR(X, Y)) is combined into one assembly instruction
 
         #define TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP( \
@@ -172,7 +170,8 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
 def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
     return textwrap.dedent(
         (
-            f"""
+            common_includes
+            + f"""
         #include <stdint.h>
 
         /* We do four channels at once to get this speed boost. */