 and kernel layout OIHW.
 """

+from collections import namedtuple
 from itertools import chain
 import textwrap
 from typing import Iterator, Tuple

+SMLAInstruction = namedtuple("SMLAInstruction", ["instruction", "tensor_var", "kernel_var"])
+

 def _get_c_function_name(split_size, dimensions, offsets, x_strides):
     """Generates a C function name for tensordot.
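For reference, SMLAInstruction introduced above is an ordinary named 3-tuple, so it keeps working with the tuple-style indexing and unpacking used later in this file. A minimal sketch (the alias strings are made up for illustration, in the style produced by _get_int16_alias):

    from collections import namedtuple

    SMLAInstruction = namedtuple("SMLAInstruction", ["instruction", "tensor_var", "kernel_var"])

    # Hypothetical alias names, only for this example.
    smla = SMLAInstruction("smlabb", "tensor__y0_x0__y0_x1", "kernel__y0_x0__y0_x1")
    assert smla.instruction == smla[0] == "smlabb"
    instruction, op1, op2 = smla   # unpacking, as _expand_instruction_tuples does
    assert (op1, op2) == smla[1:]  # slicing still yields a plain tuple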
@@ -106,18 +109,20 @@ def _load_kernel_vars(halfwords) -> Iterator[str]:
         yield f"int kernel__{var_name} = kernel[{i // 2}];"


-def _get_draft_macs(kernel_dims, tensor_halfwords, kernel_halfwords, offset) -> Iterator[Tuple]:
-    """Generates an un-optimized list of multiply-accumulate instructions.
-
-    We will optimize these into SIMD instructions later. The tuples in the returned iterator are
-    organized as:
+def _get_draft_macs(
+    kernel_dims, tensor_halfwords, kernel_halfwords, offset
+) -> Iterator[SMLAInstruction]:
+    """Generates unrolled MAC instructions to compute one tensordot sum.

-    (instruction, (arg1_y, arg1_x), (arg2_y, arg2_x))
+    Unrolling these loops increases code size a tiny bit (< 0.02 KB), but makes the generated code
+    much faster. The generated code does not use SIMD instructions - they are added later by
+    _apply_simd_optimizations.

-    We return an iterator so that optimizations may be done by iterator chaining.
+    We return an iterator of SMLAInstruction named tuples. Returning an iterator lets us do
+    optimizations by iterator chaining.
     """

-    def get_var(y, x, halfwords):
+    def get_var(y, x, halfwords) -> Tuple[str, str]:
         i = halfwords.index((y, x))
         if i % 2 == 0:
             return f"{_get_int16_alias((y, x))}__{_get_int16_alias(halfwords[i + 1])}", "b"
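To make the intermediate representation concrete, this is the kind of draft (pre-SIMD) sequence _get_draft_macs yields for two adjacent halfwords. A hand-written sketch with made-up alias names, not real generator output:

    from collections import namedtuple

    SMLAInstruction = namedtuple("SMLAInstruction", ["instruction", "tensor_var", "kernel_var"])

    # Each entry means: sum += (one int16 half of tensor_var) * (one int16 half of kernel_var),
    # where "b" selects the bottom halfword and "t" the top halfword.
    draft_macs = [
        SMLAInstruction("smlabb", "tensor__y0_x0__y0_x1", "kernel__y0_x0__y0_x1"),
        SMLAInstruction("smlatt", "tensor__y0_x0__y0_x1", "kernel__y0_x0__y0_x1"),
    ]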
@@ -129,15 +134,15 @@ def get_var(y, x, halfwords):
         tensor_var, tensor_half = get_var(y, x + offset, tensor_halfwords)
         kernel_var, kernel_half = get_var(y, x, kernel_halfwords)
         instruction = f"smla{tensor_half}{kernel_half}"
-        yield instruction, f"tensor__{tensor_var}", f"kernel__{kernel_var}"
+        yield SMLAInstruction(instruction, f"tensor__{tensor_var}", f"kernel__{kernel_var}")


-def _apply_simd_optimizations(instruction_tuples) -> Iterator[Tuple]:
+def _apply_simd_optimizations(instruction_tuples) -> Iterator[SMLAInstruction]:
     """When possible, fuses single MACs into SIMD MAC instructions.

     The compiler cannot do this automatically, as calling __builtin_arm_smlaxy forces the SMLAxy
-    instruction to be used. This function takes as input an iterator of (instruction, var1, var2)
-    tuples, and returns an iterator of (instruction, var1, var2) tuples.
+    instruction to be used. This function takes as input an iterator of SMLAInstructions and
+    returns an iterator of SMLAInstructions (possibly of different length).
     """
     curr_tuple = next(instruction_tuples, None)
     while curr_tuple:
@@ -148,10 +153,10 @@ def _apply_simd_optimizations(instruction_tuples) -> Iterator[Tuple]:

         if curr_tuple[1:] == next_tuple[1:]:
             if set([curr_tuple[0], next_tuple[0]]) == set(["smlatt", "smlabb"]):
-                yield ("smlad", *curr_tuple[1:])
+                yield SMLAInstruction("smlad", *curr_tuple[1:])
                 next_tuple = next(instruction_tuples, None)
             elif set([curr_tuple[0], next_tuple[0]]) == set(["smlatb", "smlabt"]):
-                yield ("smladx", *curr_tuple[1:])
+                yield SMLAInstruction("smladx", *curr_tuple[1:])
                 next_tuple = next(instruction_tuples, None)
             else:
                 yield curr_tuple
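The fusion rule itself is easy to state with a small example (operand names assumed for illustration): two halfword MACs on the same operand words collapse into one dual 16-bit MAC.

    from collections import namedtuple

    SMLAInstruction = namedtuple("SMLAInstruction", ["instruction", "tensor_var", "kernel_var"])

    pair = [
        SMLAInstruction("smlabb", "tensor__a", "kernel__a"),
        SMLAInstruction("smlatt", "tensor__a", "kernel__a"),
    ]
    # _apply_simd_optimizations(iter(pair)) yields the single fused instruction
    # SMLAInstruction("smlad", "tensor__a", "kernel__a"): bottom*bottom + top*top.
    # A smlabt/smlatb pair on the same operands would fuse into "smladx" instead.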
@@ -162,7 +167,7 @@ def _apply_simd_optimizations(instruction_tuples) -> Iterator[Tuple]:


 def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]:
-    """Converts a series of (instruction, var1, var2) tuples into lines of C code.
+    """Converts an iterator of SMLAInstructions into lines of C code.

     We want the compiler to re-order these with the memory loads, so we generate them as a series of
     calls to instruction aliases instead of as a single `asm` block.
@@ -171,14 +176,9 @@ def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]:
     for instruction, op1, op2 in instruction_tuples:
         assert "smla" in instruction

-        # Arm GCC does not have `__builtin_arm_smlabt`, even though `__builtin_arm_smlatt`,
-        # `__builtin_arm_smlatb`, `__builtin_arm_smlad` and so on all exist. Perhaps this is a
-        # choice, since we can just use `smlabt` with the argument order swapped instead? Note that
-        # `__builtin_arm_smlabt` exists on most compilers (e.g. Clang) - this is just a GCC thing.
-        if instruction == "smlabt":
-            yield f"sum_{index} = __builtin_arm_smlatb({op2}, {op1}, sum_{index});"
-        else:
-            yield f"sum_{index} = __builtin_arm_{instruction}({op1}, {op2}, sum_{index});"
+        # We call the instruction using the Arm C Language Extensions. Using ACLE gives better
+        # cross-compiler compatibility than using __builtin functions.
+        yield f"sum_{index} = __{instruction}({op1}, {op2}, sum_{index});"


 def _requantize_sums(num_sums, requantize_shift, output_zero_point) -> Iterator[str]:
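A sketch of the C line this now emits for one instruction (alias names assumed). Because the ACLE names, including __smlabt, are emitted directly, the old argument-swapping workaround for smlabt is no longer needed:

    index, op1, op2 = 0, "tensor__a", "kernel__a"
    line = f"sum_{index} = __smlad({op1}, {op2}, sum_{index});"
    # -> 'sum_0 = __smlad(tensor__a, kernel__a, sum_0);'
    # Previously this would have been emitted as a __builtin_arm_smlad(...) call.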
@@ -324,6 +324,7 @@ def insert_lines(lines):
         f"""
         #ifndef {function_name.upper()}_EXISTS
         #define {function_name.upper()}_EXISTS
+        #include <arm_acle.h>
         __attribute__((always_inline)) static inline int {function_name}(
             int *output, int *tensor, int *kernel, int *bias, int *scale
         ) {{
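For context, a rough sketch of the preamble the generated C function now starts with; the function name here is illustrative, and arm_acle.h is what supplies the __smlaxy/__smlad intrinsics called by the MAC lines:

    import textwrap

    function_name = "tensordot_opt_example"  # assumed name, for illustration only
    preamble = textwrap.dedent(
        f"""
        #ifndef {function_name.upper()}_EXISTS
        #define {function_name.upper()}_EXISTS
        #include <arm_acle.h>
        """
    )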