
Commit af83462

committed
Second round of code review
1 parent 7085926 commit af83462

File tree

4 files changed (+40, -27 lines changed)


python/tvm/relay/qnn/strategy/arm_cpu.py

Lines changed: 10 additions & 1 deletion
@@ -28,7 +28,16 @@
 
 @qnn_conv2d_strategy.register("arm_cpu")
 def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target):
-    """qnn.conv2d strategy for Arm Cortex-M CPUs with DSP."""
+    """qnn.conv2d strategy for Arm Cortex-M CPUs with DSP.
+
+    When computing convolutions, we want data that will be used to compute the same output values to
+    be adjacent in memory, as this lets us reuse memory loads and use more SIMD instructions.
+
+    For depthwise convolutions, channels do not interact with each other, so the NCHW and IOHW
+    layouts do the best job of keeping "related" data close. In contrast, computing one output of a
+    regular convolution requires reading all input channels, so NHWC and OHWI are best. Hence, these
+    are the layouts we support.
+    """
     if not (target.features.has_dsp and "cortex-m" in target.mcpu):
         raise TVMError(
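Illustration (not part of the commit): a minimal NumPy sketch of the layout reasoning in the new docstring, with made-up shapes. One output of a regular convolution reads every input channel, which NHWC/OHWI keeps contiguous; a depthwise convolution never mixes channels, which NCHW/IOHW keeps contiguous.

import numpy as np

# Hypothetical NHWC data and OHWI kernel, for illustration only.
data_nhwc = np.random.randint(-128, 128, size=(1, 8, 8, 16)).astype("int16")    # N, H, W, C
kernel_ohwi = np.random.randint(-128, 128, size=(32, 3, 3, 16)).astype("int16")  # O, H, W, I

# One output value of a *regular* conv2d sums over every input channel at each
# spatial tap. With NHWC/OHWI those 16 channel values are contiguous in memory,
# so each memory load can feed several SIMD multiply-accumulates.
out_0 = 0
for kh in range(3):
    for kw in range(3):
        out_0 += int(
            data_nhwc[0, kh, kw, :].astype("int32") @ kernel_ohwi[0, kh, kw, :].astype("int32")
        )

# A *depthwise* conv2d never mixes channels, so NCHW/IOHW - which keeps each
# channel's H x W plane contiguous - keeps "related" data adjacent instead.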

python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py

Lines changed: 24 additions & 23 deletions
@@ -22,10 +22,13 @@
 and kernel layout OIHW.
 """
 
+from collections import namedtuple
 from itertools import chain
 import textwrap
 from typing import Iterator, Tuple
 
+SMLAInstruction = namedtuple("SMLAInstruction", ["instruction", "tensor_var", "kernel_var"])
+
 
 def _get_c_function_name(split_size, dimensions, offsets, x_strides):
     """Generates a C function name for tensordot.
@@ -106,18 +109,20 @@ def _load_kernel_vars(halfwords) -> Iterator[str]:
         yield f"int kernel__{var_name} = kernel[{i // 2}];"
 
 
-def _get_draft_macs(kernel_dims, tensor_halfwords, kernel_halfwords, offset) -> Iterator[Tuple]:
-    """Generates an un-optimized list of multiply-accumulate instructions.
-
-    We will optimize these into SIMD instructions later. The tuples in the returned iterator are
-    organized as:
+def _get_draft_macs(
+    kernel_dims, tensor_halfwords, kernel_halfwords, offset
+) -> Iterator[SMLAInstruction]:
+    """Generates unrolled MAC instructions to compute one tensordot sum.
 
-        (instruction, (arg1_y, arg1_x), (arg2_y, arg2_x))
+    Unrolling these loops increases code size a tiny bit (< 0.02 KB), but makes the generated code
+    much faster. The generated code does not use SIMD instructions - they are added later by
+    _apply_simd_optimizations.
 
-    We return an iterator so that optimizations may be done by iterator chaining.
+    We return an iterator of SMLAInstruction named tuples. Returning an iterator lets us do
+    optimizations by iterator chaining.
     """
 
-    def get_var(y, x, halfwords):
+    def get_var(y, x, halfwords) -> Tuple[str, str]:
         i = halfwords.index((y, x))
         if i % 2 == 0:
             return f"{_get_int16_alias((y, x))}__{_get_int16_alias(halfwords[i + 1])}", "b"
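Illustration (not part of the commit): what a draft SMLAInstruction yielded by _get_draft_macs looks like. The operand alias names below are invented for the example.

from collections import namedtuple

SMLAInstruction = namedtuple("SMLAInstruction", ["instruction", "tensor_var", "kernel_var"])

# One draft (not yet SIMD-fused) multiply-accumulate: multiply the bottom ("b")
# halfwords of a packed tensor word and a packed kernel word, then accumulate.
mac = SMLAInstruction("smlabb", "tensor__y0x0__y0x1", "kernel__y0x0__y0x1")

# A named tuple still unpacks exactly like the old (instruction, var1, var2) tuples,
# so downstream code keeps working, but the fields now have readable names.
instruction, op1, op2 = mac
assert mac.instruction == "smlabb" and op1 == mac.tensor_var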
@@ -129,15 +134,15 @@ def get_var(y, x, halfwords):
             tensor_var, tensor_half = get_var(y, x + offset, tensor_halfwords)
             kernel_var, kernel_half = get_var(y, x, kernel_halfwords)
             instruction = f"smla{tensor_half}{kernel_half}"
-            yield instruction, f"tensor__{tensor_var}", f"kernel__{kernel_var}"
+            yield SMLAInstruction(instruction, f"tensor__{tensor_var}", f"kernel__{kernel_var}")
 
 
-def _apply_simd_optimizations(instruction_tuples) -> Iterator[Tuple]:
+def _apply_simd_optimizations(instruction_tuples) -> Iterator[SMLAInstruction]:
     """When possible, fuses single MACs into SIMD MAC instructions.
 
     The compiler cannot do this automatically, as calling __builtin_arm_smlaxy forces the SMLAxy
-    instruction to be used. This function takes as input an iterator of (instruction, var1, var2)
-    tuples, and returns an iterator of (instruction, var1, var2) tuples.
+    instruction to be used. This function takes as input an iterator of SMLAInstructions and returns
+    an iterator of SMLAInstructions (possibly of different length).
     """
     curr_tuple = next(instruction_tuples, None)
     while curr_tuple:
@@ -148,10 +153,10 @@ def _apply_simd_optimizations(instruction_tuples) -> Iterator[Tuple]:
 
         if curr_tuple[1:] == next_tuple[1:]:
             if set([curr_tuple[0], next_tuple[0]]) == set(["smlatt", "smlabb"]):
-                yield ("smlad", *curr_tuple[1:])
+                yield SMLAInstruction("smlad", *curr_tuple[1:])
                 next_tuple = next(instruction_tuples, None)
             elif set([curr_tuple[0], next_tuple[0]]) == set(["smlatb", "smlabt"]):
-                yield ("smladx", *curr_tuple[1:])
+                yield SMLAInstruction("smladx", *curr_tuple[1:])
                 next_tuple = next(instruction_tuples, None)
             else:
                 yield curr_tuple
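Illustration (not part of the commit): a sketch of the fusion this branch performs, assuming TVM with this patch is importable; the operand names are invented.

from tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot import (
    SMLAInstruction,
    _apply_simd_optimizations,
)

# Two halfword MACs that read the same packed tensor word and packed kernel word:
#   smlabb -> bottom halfword * bottom halfword
#   smlatt -> top halfword    * top halfword
draft = iter([
    SMLAInstruction("smlabb", "tensor__y0x0__y0x1", "kernel__y0x0__y0x1"),
    SMLAInstruction("smlatt", "tensor__y0x0__y0x1", "kernel__y0x0__y0x1"),
])

# The pair is fused into a single dual 16-bit MAC, halving the instruction count.
print(list(_apply_simd_optimizations(draft)))
# Expected (sketch):
#   [SMLAInstruction(instruction='smlad', tensor_var='tensor__y0x0__y0x1',
#                    kernel_var='kernel__y0x0__y0x1')]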
@@ -162,7 +167,7 @@ def _apply_simd_optimizations(instruction_tuples) -> Iterator[Tuple]:
 
 
 def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]:
-    """Converts a series of (instruction, var1, var2) tuples into lines of C code.
+    """Converts an iterator of SMLAInstructions into lines of C code.
 
     We want the compiler to re-order these with the memory loads, so we generate them as a series of
     calls to instruction aliases instead of as a single `asm` block.
@@ -171,14 +176,9 @@ def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]:
     for instruction, op1, op2 in instruction_tuples:
         assert "smla" in instruction
 
-        # Arm GCC does not have `__builtin_arm_smlabt`, even though `__builtin_arm_smlatt`,
-        # `__builtin_arm_smlatb`, `__builtin_arm_smlad` and so on all exist. Perhaps this is a
-        # choice, since we can just use `smlabt` with the argument order swapped instead? Note that
-        # `__builtin_arm_smlabt` exists on most compilers (e.g. Clang) - this is just a GCC thing.
-        if instruction == "smlabt":
-            yield f"sum_{index} = __builtin_arm_smlatb({op2}, {op1}, sum_{index});"
-        else:
-            yield f"sum_{index} = __builtin_arm_{instruction}({op1}, {op2}, sum_{index});"
+        # We call the instruction using the Arm C Language Extensions. Using ACLE gives better
+        # cross-compiler compatibility than using __builtin functions.
+        yield f"sum_{index} = __{instruction}({op1}, {op2}, sum_{index});"
 
 
 def _requantize_sums(num_sums, requantize_shift, output_zero_point) -> Iterator[str]:
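Illustration (not part of the commit): what the new code generation emits for a single fused MAC, assuming TVM with this patch is importable; the operand names are invented.

from tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot import (
    SMLAInstruction,
    _expand_instruction_tuples,
)

macs = [SMLAInstruction("smlad", "tensor__y0x0__y0x1", "kernel__y0x0__y0x1")]
print(list(_expand_instruction_tuples(iter(macs), 0)))
# Expected (sketch):
#   ['sum_0 = __smlad(tensor__y0x0__y0x1, kernel__y0x0__y0x1, sum_0);']
# The __smlad intrinsic comes from <arm_acle.h>, which the generated C now includes
# (see the hunk below), instead of the GCC-specific __builtin_arm_* calls.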
@@ -324,6 +324,7 @@ def insert_lines(lines):
         f"""
         #ifndef {function_name.upper()}_EXISTS
         #define {function_name.upper()}_EXISTS
+        #include <arm_acle.h>
         __attribute__((always_inline)) static inline int {function_name}(
             int *output, int *tensor, int *kernel, int *bias, int *scale
         ) {{

src/relay/qnn/op/convolution.cc

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ bool QnnConv2DRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
       << "Expected qnn conv2d type(int8, uint8, int16) for input but was " << data->dtype;
   ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8) ||
          weight->dtype == DataType::Int(16))
-      << "Expected qnn conv2d type(int8, uint8) for weight but was " << weight->dtype;
+      << "Expected qnn conv2d type(int8, uint8, int16) for weight but was " << weight->dtype;
   ICHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == DataType::Int(32) ||
          param->out_dtype == DataType::Int(64))
       << "Expected qnn conv2d type(int16, int32, int64) for output but was " << param->out_dtype;

tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py

Lines changed: 5 additions & 2 deletions
@@ -292,8 +292,11 @@ def test_qnn_conv2d_mobilenetv1_layer(interpreter, layer):
     Loads the input, kernel, bias, expected output, and quantization parameters from the specified
     layer in a TFLite Interpreter. That information is used to construct a Relay Function with the
     same structure. The Function is run using microTVM and AOTTestModel, and we verify microTVM's
-    output is the same as the TFLite ground truth. Only works for 2D convolutions (depthwise and
-    regular).
+    output is the same as the TFLite ground truth.
+
+    This function only cross-checks the first 23 layers in MobileNetV1, which are regular and
+    depthwise 2D convolutions (this function only works for 2D convolutions). We do not test the
+    average pool, dense, or softmax layers at the end of the model.
 
     Note that we disable the QNN Legalization pass. This allows TVM to use its QNN compute
     definitions, fuse the three operations together, and perform other optimizations.