 and kernel layout OIHW.
 """

+from collections import namedtuple
 from itertools import chain
 import textwrap
 from typing import Iterator, Tuple

+SMLAInstruction = namedtuple("SMLAInstruction", ["instruction", "tensor_var", "kernel_var"])
+

 def _get_c_function_name(split_size, dimensions, offsets, x_strides):
     """Generates a C function name for tensordot.
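For reference, SMLAInstruction introduced above is an ordinary named 3-tuple, so it keeps working with the tuple-style indexing and unpacking used later in this file. A minimal sketch (the alias strings are made up for illustration, in the style produced by _get_int16_alias):

    from collections import namedtuple

    SMLAInstruction = namedtuple("SMLAInstruction", ["instruction", "tensor_var", "kernel_var"])

    # Hypothetical alias names, only for this example.
    smla = SMLAInstruction("smlabb", "tensor__y0_x0__y0_x1", "kernel__y0_x0__y0_x1")
    assert smla.instruction == smla[0] == "smlabb"
    instruction, op1, op2 = smla   # unpacking, as _expand_instruction_tuples does
    assert (op1, op2) == smla[1:]  # slicing still yields a plain tuple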
@@ -106,18 +109,20 @@ def _load_kernel_vars(halfwords) -> Iterator[str]:
         yield f"int kernel__{var_name} = kernel[{i // 2}];"


-def _get_draft_macs(kernel_dims, tensor_halfwords, kernel_halfwords, offset) -> Iterator[Tuple]:
-    """Generates an un-optimized list of multiply-accumulate instructions.
-
-    We will optimize these into SIMD instructions later. The tuples in the returned iterator are
-    organized as:
+def _get_draft_macs(
+    kernel_dims, tensor_halfwords, kernel_halfwords, offset
+) -> Iterator[SMLAInstruction]:
+    """Generates unrolled MAC instructions to compute one tensordot sum.

-    (instruction, (arg1_y, arg1_x), (arg2_y, arg2_x))
+    Unrolling these loops increases code size a tiny bit (< 0.02 KB), but makes the generated code
+    much faster. The generated code does not use SIMD instructions - they are added later by
+    _apply_simd_optimizations.

-    We return an iterator so that optimizations may be done by iterator chaining.
+    We return an iterator of SMLAInstruction named tuples. Returning an iterator lets us do
+    optimizations by iterator chaining.
     """

-    def get_var(y, x, halfwords):
+    def get_var(y, x, halfwords) -> Tuple[str, str]:
         i = halfwords.index((y, x))
         if i % 2 == 0:
             return f"{_get_int16_alias((y, x))}__{_get_int16_alias(halfwords[i + 1])}", "b"
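To make the intermediate representation concrete, this is the kind of draft (pre-SIMD) sequence _get_draft_macs yields for two adjacent halfwords. A hand-written sketch with made-up alias names, not real generator output:

    from collections import namedtuple

    SMLAInstruction = namedtuple("SMLAInstruction", ["instruction", "tensor_var", "kernel_var"])

    # Each entry means: sum += (one int16 half of tensor_var) * (one int16 half of kernel_var),
    # where "b" selects the bottom halfword and "t" the top halfword.
    draft_macs = [
        SMLAInstruction("smlabb", "tensor__y0_x0__y0_x1", "kernel__y0_x0__y0_x1"),
        SMLAInstruction("smlatt", "tensor__y0_x0__y0_x1", "kernel__y0_x0__y0_x1"),
    ]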
@@ -129,15 +134,15 @@ def get_var(y, x, halfwords):
         tensor_var, tensor_half = get_var(y, x + offset, tensor_halfwords)
         kernel_var, kernel_half = get_var(y, x, kernel_halfwords)
         instruction = f"smla{tensor_half}{kernel_half}"
-        yield instruction, f"tensor__{tensor_var}", f"kernel__{kernel_var}"
+        yield SMLAInstruction(instruction, f"tensor__{tensor_var}", f"kernel__{kernel_var}")


-def _apply_simd_optimizations(instruction_tuples) -> Iterator[Tuple]:
+def _apply_simd_optimizations(instruction_tuples) -> Iterator[SMLAInstruction]:
     """When possible, fuses single MACs into SIMD MAC instructions.

     The compiler cannot do this automatically, as calling __builtin_arm_smlaxy forces the SMLAxy
-    instruction to be used. This function takes as input an iterator of (instruction, var1, var2)
-    tuples, and returns an iterator of (instruction, var1, var2) tuples.
+    instruction to be used. This function takes as input an iterator of SMLAInstructions and
+    returns an iterator of SMLAInstructions (possibly of different length).
     """
     curr_tuple = next(instruction_tuples, None)
     while curr_tuple:
@@ -148,10 +153,10 @@ def _apply_simd_optimizations(instruction_tuples) -> Iterator[Tuple]:

         if curr_tuple[1:] == next_tuple[1:]:
             if set([curr_tuple[0], next_tuple[0]]) == set(["smlatt", "smlabb"]):
-                yield ("smlad", *curr_tuple[1:])
+                yield SMLAInstruction("smlad", *curr_tuple[1:])
                 next_tuple = next(instruction_tuples, None)
             elif set([curr_tuple[0], next_tuple[0]]) == set(["smlatb", "smlabt"]):
-                yield ("smladx", *curr_tuple[1:])
+                yield SMLAInstruction("smladx", *curr_tuple[1:])
                 next_tuple = next(instruction_tuples, None)
             else:
                 yield curr_tuple
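The fusion rule itself is easy to state with a small example (operand names assumed for illustration): two halfword MACs on the same operand words collapse into one dual 16-bit MAC.

    from collections import namedtuple

    SMLAInstruction = namedtuple("SMLAInstruction", ["instruction", "tensor_var", "kernel_var"])

    pair = [
        SMLAInstruction("smlabb", "tensor__a", "kernel__a"),
        SMLAInstruction("smlatt", "tensor__a", "kernel__a"),
    ]
    # _apply_simd_optimizations(iter(pair)) yields the single fused instruction
    # SMLAInstruction("smlad", "tensor__a", "kernel__a"): bottom*bottom + top*top.
    # A smlabt/smlatb pair on the same operands would fuse into "smladx" instead.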
@@ -162,7 +167,7 @@ def _apply_simd_optimizations(instruction_tuples) -> Iterator[Tuple]:


 def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]:
-    """Converts a series of (instruction, var1, var2) tuples into lines of C code.
+    """Converts an iterator of SMLAInstructions into lines of C code.

     We want the compiler to re-order these with the memory loads, so we generate them as a series of
     calls to instruction aliases instead of as a single `asm` block.
@@ -171,14 +176,9 @@ def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]:
     for instruction, op1, op2 in instruction_tuples:
         assert "smla" in instruction

-        # Arm GCC does not have `__builtin_arm_smlabt`, even though `__builtin_arm_smlatt`,
-        # `__builtin_arm_smlatb`, `__builtin_arm_smlad` and so on all exist. Perhaps this is a
-        # choice, since we can just use `smlabt` with the argument order swapped instead? Note that
-        # `__builtin_arm_smlabt` exists on most compilers (e.g. Clang) - this is just a GCC thing.
-        if instruction == "smlabt":
-            yield f"sum_{index} = __builtin_arm_smlatb({op2}, {op1}, sum_{index});"
-        else:
-            yield f"sum_{index} = __builtin_arm_{instruction}({op1}, {op2}, sum_{index});"
+        # We call the instruction using the Arm C Language Extensions. Using ACLE gives better
+        # cross-compiler compatibility than using __builtin functions.
+        yield f"sum_{index} = __{instruction}({op1}, {op2}, sum_{index});"


 def _requantize_sums(num_sums, requantize_shift, output_zero_point) -> Iterator[str]:
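A sketch of the C line this now emits for one instruction (alias names assumed). Because the ACLE names, including __smlabt, are emitted directly, the old argument-swapping workaround for smlabt is no longer needed:

    index, op1, op2 = 0, "tensor__a", "kernel__a"
    line = f"sum_{index} = __smlad({op1}, {op2}, sum_{index});"
    # -> 'sum_0 = __smlad(tensor__a, kernel__a, sum_0);'
    # Previously this would have been emitted as a __builtin_arm_smlad(...) call.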
@@ -324,6 +324,7 @@ def insert_lines(lines):
         f"""
         #ifndef {function_name.upper()}_EXISTS
         #define {function_name.upper()}_EXISTS
+        #include <arm_acle.h>
         __attribute__((always_inline)) static inline int {function_name}(
             int *output, int *tensor, int *kernel, int *bias, int *scale
         ) {{
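For context, a rough sketch of the preamble the generated C function now starts with; the function name here is illustrative, and arm_acle.h is what supplies the __smlaxy/__smlad intrinsics called by the MAC lines:

    import textwrap

    function_name = "tensordot_opt_example"  # assumed name, for illustration only
    preamble = textwrap.dedent(
        f"""
        #ifndef {function_name.upper()}_EXISTS
        #define {function_name.upper()}_EXISTS
        #include <arm_acle.h>
        """
    )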