Skip to content

Commit 6bc72bb

Browse files
Mousius and Ashutosh Parkhi
authored
[microTVM] Replace arm_nnsupportfunctions.h with arm_acle.h (#13363)
* [microTVM] Replace arm_nnsupportfunctions.h with arm_acle.h

  This attempts to replace the CMSIS-NN header with a more portable alternative and avoid dependence on CMSIS.

* Remove the CMSIS __STATIC_FORCEINLINE macro

* Replace more intrinsics with ACLE variants

* Use builtins for intrinsics missing in older GCC

* Re-use common_includes to propagate shared functions

  The packing definitions aren't implemented as ACLE intrinsics, nor is there a simple way to convince a C compiler to generate them.

* Properly align memory access

  Introduce `memcpy` to explain to the compiler that we're changing the alignment of `int16_t` to `int32_t`. What this appears to actually do is encourage the compiler to use three loads rather than one double load plus a regular load. The padded array is aligned as an `int16_t`; it isn't guaranteed to behave like an `int32_t`-aligned array. One of the side effects of the type punning from `int16_t*` to `int32_t*` is that we're effectively lying to the compiler that this is correctly aligned and that it can use instructions which load multiple `int32_t`s at the same time — this does not work 😿

Co-authored-by: Ashutosh Parkhi <[email protected]>
1 parent a435cbb commit 6bc72bb

File tree

5 files changed

+93
-42
lines changed

5 files changed

+93
-42
lines changed

python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def sum_impl(N, uniq_id):
101101
#ifdef __cplusplus
102102
extern "C"
103103
#endif // __cplusplus
104-
__STATIC_FORCEINLINE int32_t sum16_reset_{uniq_id}(
104+
__attribute__((always_inline)) static inline int32_t sum16_reset_{uniq_id}(
105105
int16_t *res) {{
106106
*res = (int16_t)0;
107107
return 0;
@@ -110,7 +110,7 @@ def sum_impl(N, uniq_id):
110110
#ifdef __cplusplus
111111
extern "C"
112112
#endif
113-
__STATIC_FORCEINLINE int32_t sum16_{N}_{uniq_id}(
113+
__attribute__((always_inline)) static inline int32_t sum16_{N}_{uniq_id}(
114114
int16_t *arr,
115115
int16_t *res16,
116116
long arr_offset,
@@ -129,7 +129,7 @@ def sum_impl(N, uniq_id):
129129
}}
130130
131131
for ( int i = 0; i < n / 2; ++ i ) {{
132-
res = __SMLAD(*p32, 0x00010001, res);
132+
res = __smlad(*p32, 0x00010001, res);
133133
++ p32;
134134
}}
135135

python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,42 @@
2424
#include <stdlib.h>
2525
#include <string.h>
2626
27-
#include <arm_nnsupportfunctions.h>
27+
#include <arm_acle.h>
2828
2929
#include <tvm/runtime/crt/error_codes.h>
3030
31+
32+
#ifndef ARM_CPU_INTRINSICS_EXIST
33+
#define ARM_CPU_INTRINSICS_EXIST
34+
__attribute__((always_inline)) uint32_t __ror(uint32_t op1, uint32_t op2)
35+
{
36+
op2 %= 32U;
37+
if (op2 == 0U)
38+
{
39+
return op1;
40+
}
41+
return (op1 >> op2) | (op1 << (32U - op2));
42+
}
43+
44+
#define __pkhbt(ARG1,ARG2,ARG3) \
45+
__extension__ \
46+
({ \
47+
uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
48+
__asm("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \
49+
__RES; \
50+
})
51+
52+
#define __pkhtb(ARG1,ARG2,ARG3) \
53+
__extension__ \
54+
({ \
55+
uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
56+
if (ARG3 == 0) \
57+
__asm("pkhtb %0, %1, %2" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2) ); \
58+
else \
59+
__asm("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \
60+
__RES; \
61+
})
62+
#endif
3163
"""
3264

3365
MICRO_WORD_LENGTH_BITS = 32

python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py

Lines changed: 43 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -132,12 +132,30 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
132132
cc_code = (
133133
common.common_includes
134134
+ f"""
135+
#ifndef ARM_CPU_MPROFILE_READ_AND_PAD_EXISTS
136+
#define ARM_CPU_MPROFILE_READ_AND_PAD_EXISTS
137+
__attribute__((always_inline)) static inline const int8_t *read_and_pad(const int8_t *source, int32_t *out1, int32_t *out2)
138+
{{
139+
int32_t inA;
140+
memcpy(&inA, source, 4);
141+
source += 4;
142+
143+
int32_t inAbuf1 = __sxtb16(__ror((uint32_t)inA, 8));
144+
int32_t inAbuf2 = __sxtb16(inA);
145+
*out2 = (int32_t)(__pkhtb(inAbuf1, inAbuf2, 16));
146+
*out1 = (int32_t)(__pkhbt(inAbuf2, inAbuf1, 16));
147+
148+
return source;
149+
}}
150+
#endif
151+
"""
152+
+ f"""
135153
136154
137155
#ifdef __cplusplus
138156
extern "C"
139157
#endif
140-
__STATIC_FORCEINLINE int32_t gemm_{M}x{N}_body_rest_{uniq_id}(
158+
__attribute__((always_inline)) static inline int32_t gemm_{M}x{N}_body_rest_{uniq_id}(
141159
int K,
142160
int8_t *aa, int8_t *bb, int32_t *cc,
143161
int A_stride, int B_stride, int C_stride) {{
@@ -180,7 +198,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
180198
#ifdef __cplusplus
181199
extern "C"
182200
#endif
183-
__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_loop_{uniq_id}(
201+
__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_body_loop_{uniq_id}(
184202
int8_t *aa, int8_t *bb, int32_t *cc,
185203
int A_stride, int B_stride, int C_stride) {{
186204
for (int i = 0; i < {M}; i++) {{
@@ -201,7 +219,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
201219
#ifdef __cplusplus
202220
extern "C"
203221
#endif
204-
__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_{uniq_id}(
222+
__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_body_{uniq_id}(
205223
int8_t *aa, int8_t *bb, int32_t *cc,
206224
int A_stride, int B_stride, int C_stride) {{
207225
int16_t bb_pad[{bb_pad_size}];
@@ -226,7 +244,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
226244
int32_t *bb_ptr = (int32_t *) &bb_pad[j*{K}];
227245
int32_t sum = 0;
228246
for (int l = 0; l < 2 * ({K} / 4); l++) {{
229-
sum = __SMLAD(*aa_ptr, *bb_ptr, sum);
247+
sum = __smlad(*aa_ptr, *bb_ptr, sum);
230248
++ aa_ptr; ++ bb_ptr;
231249
}}
232250
// NOTE: this is the line where `*_body` differs from `*_update`. here
@@ -246,7 +264,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
246264
#ifdef __cplusplus
247265
extern "C"
248266
#endif
249-
__STATIC_FORCEINLINE int32_t gemm_{M}x{N}_update_rest_{uniq_id}(
267+
__attribute__((always_inline)) static inline int32_t gemm_{M}x{N}_update_rest_{uniq_id}(
250268
int K,
251269
int8_t *aa, int8_t *bb, int32_t *cc,
252270
int A_stride, int B_stride, int C_stride) {{
@@ -289,7 +307,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
289307
#ifdef __cplusplus
290308
extern "C"
291309
#endif
292-
__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_loop_{uniq_id}(
310+
__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_update_loop_{uniq_id}(
293311
int8_t *aa, int8_t *bb, int32_t *cc,
294312
int A_stride, int B_stride, int C_stride) {{
295313
for (int i = 0; i < {M}; i++) {{
@@ -307,7 +325,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
307325
#ifdef __cplusplus
308326
extern "C"
309327
#endif
310-
__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_{uniq_id}(
328+
__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_update_{uniq_id}(
311329
int8_t *aa, int8_t *bb, int32_t *cc,
312330
int A_stride, int B_stride, int C_stride) {{
313331
int16_t bb_pad[{bb_pad_size}];
@@ -332,7 +350,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
332350
int32_t *bb_ptr = (int32_t *) &bb_pad[j*{K}];
333351
int32_t sum = 0;
334352
for (int l = 0; l < 2 * ({K} / 4); l++) {{
335-
sum = __SMLAD(*aa_ptr, *bb_ptr, sum);
353+
sum = __smlad(*aa_ptr, *bb_ptr, sum);
336354
++ aa_ptr; ++ bb_ptr;
337355
}}
338356
cc[i*C_stride + j] += sum;
@@ -349,7 +367,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
349367
#ifdef __cplusplus
350368
extern "C"
351369
#endif
352-
__STATIC_FORCEINLINE int32_t gemm16_{M}x{N}_body_rest_{uniq_id}(
370+
__attribute__((always_inline)) static inline int32_t gemm16_{M}x{N}_body_rest_{uniq_id}(
353371
int K,
354372
int16_t *aa, int16_t *bb, int32_t *cc,
355373
int A_stride, int B_stride, int C_stride) {{
@@ -367,7 +385,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
367385
#ifdef __cplusplus
368386
extern "C"
369387
#endif
370-
__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(
388+
__attribute__((always_inline)) static inline int32_t gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(
371389
int16_t *aa, int16_t *bb, int32_t *cc,
372390
int A_stride, int B_stride, int C_stride) {{
373391
for (int i = 0; i < {M}; i++) {{
@@ -388,7 +406,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
388406
#ifdef __cplusplus
389407
extern "C"
390408
#endif
391-
__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_body_{uniq_id}(
409+
__attribute__((always_inline)) static inline int32_t gemm16_{M}x{K}x{N}_body_{uniq_id}(
392410
int16_t *aa, int16_t *bb, int32_t *cc,
393411
int A_stride, int B_stride, int C_stride) {{
394412
int32_t retcode = 0;
@@ -405,13 +423,14 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
405423
406424
for (int i = 0; i < {M}; i++) {{
407425
for (int j = 0; j < {N}; j++) {{
408-
int32_t *aa_ptr = (int32_t *) &aa[i*A_stride];
409-
int32_t *bb_ptr = (int32_t *) &bb[j*B_stride];
426+
int32_t aa_vector[{K} / 2];
427+
int32_t bb_vector[{K} / 2];
428+
memcpy(&aa_vector, &aa[i * A_stride], sizeof(aa_vector));
429+
memcpy(&bb_vector, &bb[j * B_stride], sizeof(bb_vector));
410430
411431
int32_t sum = 0;
412432
for (int l = 0; l < {K} / 2; l++) {{
413-
sum = __SMLAD(*aa_ptr, *bb_ptr, sum);
414-
++ aa_ptr; ++ bb_ptr;
433+
sum = __smlad(aa_vector[l], bb_vector[l], sum);
415434
}}
416435
// NOTE: this is the line where `*_body` differs from `*_update`. here
417436
// we're *setting* the result, instead of accumulating, because we know
@@ -430,7 +449,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
430449
#ifdef __cplusplus
431450
extern "C"
432451
#endif
433-
__STATIC_FORCEINLINE int32_t gemm16_{M}x{N}_update_rest_{uniq_id}(
452+
__attribute__((always_inline)) static inline int32_t gemm16_{M}x{N}_update_rest_{uniq_id}(
434453
int K,
435454
int16_t *aa, int16_t *bb, int32_t *cc,
436455
int A_stride, int B_stride, int C_stride) {{
@@ -448,7 +467,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
448467
#ifdef __cplusplus
449468
extern "C"
450469
#endif
451-
__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(
470+
__attribute__((always_inline)) static inline int32_t gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(
452471
int16_t *aa, int16_t *bb, int32_t *cc,
453472
int A_stride, int B_stride, int C_stride) {{
454473
for (int i = 0; i < {M}; i++) {{
@@ -466,7 +485,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
466485
#ifdef __cplusplus
467486
extern "C"
468487
#endif
469-
__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_update_{uniq_id}(
488+
__attribute__((always_inline)) static inline int32_t gemm16_{M}x{K}x{N}_update_{uniq_id}(
470489
int16_t *aa, int16_t *bb, int32_t *cc,
471490
int A_stride, int B_stride, int C_stride) {{
472491
int32_t retcode = 0;
@@ -478,13 +497,14 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
478497
479498
for (int i = 0; i < {M}; i++) {{
480499
for (int j = 0; j < {N}; j++) {{
481-
int32_t *aa_ptr = (int32_t *) &aa[i*A_stride];
482-
int32_t *bb_ptr = (int32_t *) &bb[j*B_stride];
500+
int32_t aa_vector[{K} / 2];
501+
int32_t bb_vector[{K} / 2];
502+
memcpy(&aa_vector, &aa[i * A_stride], sizeof(aa_vector));
503+
memcpy(&bb_vector, &bb[j * B_stride], sizeof(bb_vector));
483504
484505
int32_t sum = 0;
485506
for (int l = 0; l < {K} / 2; l++) {{
486-
sum = __SMLAD(*aa_ptr, *bb_ptr, sum);
487-
++ aa_ptr; ++ bb_ptr;
507+
sum = __smlad(aa_vector[l], bb_vector[l], sum);
488508
}}
489509
cc[i*C_stride + j] += sum;
490510
}}
@@ -500,7 +520,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
500520
#ifdef __cplusplus
501521
extern "C"
502522
#endif
503-
__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_reset_{uniq_id}(int32_t *cc, int C_stride) {{
523+
__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_reset_{uniq_id}(int32_t *cc, int C_stride) {{
504524
for (int i = 0; i < {M}; i++) {{
505525
for (int j = 0; j < {N}; j++) {{
506526
cc[i*C_stride + j] = 0;

python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def max_impl(uniq_id):
9494
#ifdef __cplusplus
9595
extern "C"
9696
#endif
97-
__STATIC_FORCEINLINE int32_t max8_reset_{uniq_id}(
97+
__attribute__((always_inline)) static inline int32_t max8_reset_{uniq_id}(
9898
int8_t *res,
9999
int N) {{
100100
memset(res, (int8_t)-128, N * sizeof(*res));
@@ -104,7 +104,7 @@ def max_impl(uniq_id):
104104
#ifdef __cplusplus
105105
extern "C"
106106
#endif
107-
__STATIC_FORCEINLINE int32_t max8_loop_{uniq_id}(
107+
__attribute__((always_inline)) static inline int32_t max8_loop_{uniq_id}(
108108
int8_t *arg,
109109
int8_t *res,
110110
int N) {{
@@ -117,7 +117,7 @@ def max_impl(uniq_id):
117117
#ifdef __cplusplus
118118
extern "C"
119119
#endif
120-
__STATIC_FORCEINLINE int32_t max8_{uniq_id}(
120+
__attribute__((always_inline)) static inline int32_t max8_{uniq_id}(
121121
int8_t *arg,
122122
int8_t *res,
123123
int N) {{
@@ -146,8 +146,8 @@ def max_impl(uniq_id):
146146
for ( int i = 0; i < N / 4; ++ i ) {{
147147
int32_t arg32 = *parg32 ++;
148148
int32_t res32 = *pres32;
149-
__SSUB8(arg32, res32);
150-
res32 = __SEL(arg32, res32);
149+
__ssub8(arg32, res32);
150+
res32 = __sel(arg32, res32);
151151
*pres32 ++ = res32;
152152
}}
153153

python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import textwrap
2424

2525
from tvm import te, tir
26-
from .common import num_simd_lanes_per_word
26+
from .common import num_simd_lanes_per_word, common_includes
2727

2828

2929
def _get_func_name(in_dtype, tensor_w, channels, kernel_h, kernel_w, suffix):
@@ -107,10 +107,8 @@ def multi_channel_convolve_impl(in_dtype, *args) -> str:
107107
def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
108108
return textwrap.dedent(
109109
(
110-
f"""
111-
#include <stdint.h>
112-
#include <arm_nnsupportfunctions.h>
113-
110+
common_includes
111+
+ f"""
114112
// __SXTB16(_ROR(X, Y)) is combined into one assembly instruction
115113
116114
#define TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP( \
@@ -120,13 +118,13 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
120118
\
121119
uint32_t kernel_c3210 = *arranged_kernel++; \
122120
\
123-
uint32_t tensor_c20 = __SXTB16(tensor_c3210); \
124-
uint32_t kernel_c20 = __SXTB16(kernel_c3210); \
121+
uint32_t tensor_c20 = __sxtb16(tensor_c3210); \
122+
uint32_t kernel_c20 = __sxtb16(kernel_c3210); \
125123
sum_c0 = __builtin_arm_smlabb(tensor_c20, kernel_c20, sum_c0); \
126124
sum_c2 = __builtin_arm_smlatt(tensor_c20, kernel_c20, sum_c2); \
127125
\
128-
uint32_t tensor_c31 = __SXTB16(__ROR(tensor_c3210, 8)); \
129-
uint32_t kernel_c31 = __SXTB16(__ROR(kernel_c3210, 8)); \
126+
uint32_t tensor_c31 = __sxtb16(__ror(tensor_c3210, 8)); \
127+
uint32_t kernel_c31 = __sxtb16(__ror(kernel_c3210, 8)); \
130128
sum_c1 = __builtin_arm_smlabb(tensor_c31, kernel_c31, sum_c1); \
131129
sum_c3 = __builtin_arm_smlatt(tensor_c31, kernel_c31, sum_c3); \
132130
}}
@@ -172,7 +170,8 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
172170
def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
173171
return textwrap.dedent(
174172
(
175-
f"""
173+
common_includes
174+
+ f"""
176175
#include <stdint.h>
177176
178177
/* We do four channels at once to get this speed boost. */

0 commit comments

Comments
 (0)