# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-"""Computes a "jumpy tensordot" operator, which can be used to tensorize many common operators
-including regular conv2d, depthwise conv2d, and grouped conv2d for some data and kernel layouts.
-When for regular convolution, use data laout HHWC and kernel layout OHWI. For depthwise convolution,
-use data layout data layout is NCHW and kernel layout OIHW."""
+"""Generates optimized code to compute a tensor dot product on ARMv7E-M.
+
+This function can be used to tensorize many common operators including regular conv2d, depthwise
+conv2d, and grouped conv2d for some data and kernel layouts. For regular convolution, use data
+layout HHWC and kernel layout OHWI. For depthwise convolution, use data layout NCHW and kernel
+layout OIHW.
+"""

from itertools import chain
import textwrap
from typing import Iterator, Tuple


def _get_c_function_name(split_size, dimensions, offsets, x_strides):
-    """Gets the C function name of the tensordot function. We do not need a suffix, as the generated
-    function will have an #include guard. Unlike other microTVM operators, _get_c_function_name is
-    never called externally."""
+    """Generates a C function name for tensordot.
+
+    We do not need a suffix, as the generated function will have an #include guard. Unlike other
+    microTVM operators, _get_c_function_name is never called externally.
+    """
    tensor_w, kernel_h, kernel_w = dimensions
    return (
        f"tensordot_opt_x{split_size}_int16_w{tensor_w}_"
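# Illustrative only (not part of the diff): with split_size=2 and tensor_w=48 the generated
# name begins with "tensordot_opt_x2_int16_w48_"; the remaining suffix is presumably built
# from the kernel dimensions, offsets, and strides on the lines not shown in this hunk.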
@@ -38,12 +43,16 @@ def _get_c_function_name(split_size, dimensions, offsets, x_strides):


def _init_biased_accumulators(split_size):
-    """Addition is commutative, so we could add the bias before, during, or after performing our
-    multiply-accumulate operations. It "costs" one cycle either way - if done at the beginning we
-    can't use a SMULXY trick to set sum_i to zero for "free", and if done at the end it doesn't
-    combine with anything. However, doing it at the beginning frees up a register/prevents needing
-    to do a stack push/pop, so we'll do it first."""
-    assignments = map(lambda x: f"sum_{x:x} = bias", range(split_size))
+    """Generates code to load the bias into the accumulators.
+
+    Addition is commutative, so we could add the bias before, during, or after performing our
+    multiply-accumulate operations. Where we add the bias does not change the overflow behavior.
+
+    Doing the bias add takes one cycle either way (if done at the beginning we can't use a SMULXY
+    trick to set sum_i to zero for "free"). However, doing it at the beginning frees up a register,
+    so we'll do it first.
+    """
+    assignments = map(lambda x: f"sum_{x:x} = *bias", range(split_size))
    joined_assignments = ", ".join(assignments)
    return f"int {joined_assignments};"

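# A minimal, runnable sketch (not part of the diff) of what the updated helper generates:
# with the new pointer-typed bias, split_size=2 produces a single C declaration that seeds
# both accumulators from *bias.
def _init_biased_accumulators_sketch(split_size):
    assignments = map(lambda x: f"sum_{x:x} = *bias", range(split_size))
    return f"int {', '.join(assignments)};"

assert _init_biased_accumulators_sketch(2) == "int sum_0 = *bias, sum_1 = *bias;"
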
@@ -98,12 +107,15 @@ def _load_kernel_vars(halfwords) -> Iterator[str]:


def _get_draft_macs(kernel_dims, tensor_halfwords, kernel_halfwords, offset) -> Iterator[Tuple]:
-    """Generates a functional but un-optimized list of multiply-accumulate instructions that we will
-    optimize later. The tuples in the returned iterator are organized as:
+    """Generates an un-optimized list of multiply-accumulate instructions.
+
+    We will optimize these into SIMD instructions later. The tuples in the returned iterator are
+    organized as:

        (instruction, (arg1_y, arg1_x), (arg2_y, arg2_x))

-    We return an iterator so that optimizations may be done by iterator chaining."""
+    We return an iterator so that optimizations may be done by iterator chaining.
+    """

    def get_var(y, x, halfwords):
        i = halfwords.index((y, x))
@@ -121,9 +133,12 @@ def get_var(y, x, halfwords):


def _apply_simd_optimizations(instruction_tuples) -> Iterator[Tuple]:
-    """Fuses single halfword MAC instructions into double halfword MAC instructions when possible.
+    """When possible, fuses single MACs into SIMD MAC instructions.
+
    The compiler cannot do this automatically, as calling __builtin_arm_smlaxy forces the SMLAxy
-    instruction to be used."""
+    instruction to be used. This function takes as input an iterator of (instruction, var1, var2)
+    tuples, and returns an iterator of (instruction, var1, var2) tuples.
+    """
    curr_tuple = next(instruction_tuples, None)
    while curr_tuple:
        next_tuple = next(instruction_tuples, None)
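# A simplified sketch (not the actual fusion logic) of the kind of rewrite this pass performs:
# a bottom-half MAC followed by a top-half MAC on the same packed words can be replaced by one
# dual-halfword MAC. The mnemonics are ARM DSP instruction names; treating the tuples this way
# is an assumption for illustration, and the real code also handles other pairings.
def _fuse_pair_sketch(first, second):
    if first[0] == "smlabb" and second[0] == "smlatt" and first[1:] == second[1:]:
        return ("smlad",) + tuple(first[1:])
    return None

assert _fuse_pair_sketch(("smlabb", "t_0", "k_0"), ("smlatt", "t_0", "k_0")) == ("smlad", "t_0", "k_0")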
@@ -147,9 +162,10 @@ def _apply_simd_optimizations(instruction_tuples) -> Iterator[Tuple]:


def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]:
-    """Converts a series of (instruction, var1, var2) tuples into lines of C code. We want the
-    compiler to re-order these with the memory loads, so we generate them as a series of calls to
-    instruction aliases instead of as a single `asm` block.
+    """Converts a series of (instruction, var1, var2) tuples into lines of C code.
+
+    We want the compiler to re-order these with the memory loads, so we generate them as a series of
+    calls to instruction aliases instead of as a single `asm` block.
    """

    for instruction, op1, op2 in instruction_tuples:
@@ -165,10 +181,13 @@ def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]:
        yield f"sum_{index} = __builtin_arm_{instruction}({op1}, {op2}, sum_{index});"


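# For illustration (operand names are invented): the tuple ("smlad", "tensor__a", "kernel__a")
# at index 0 expands to the line below, which the compiler is then free to interleave with the
# surrounding memory loads:
#     sum_0 = __builtin_arm_smlad(tensor__a, kernel__a, sum_0);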
-def _requantize_sums(num_sums) -> Iterator[str]:
-    """Simulates multiplying by the float32 requantization scale by doing a int64 multiply + shift,
-    which is much faster. The bias is added at the beginning, so we can skip doing it now. The shift
-    is hard-coded, as this saves a few cycles without hurting accuracy in "most" cases.
+def _requantize_sums(num_sums, requantize_shift, output_zero_point) -> Iterator[str]:
+    """Generates code to requantize the accumulator values.
+
+    The generated code does not use floating point instructions, as it simulates floating point
+    multiplication with an int64 multiply + shift. The bias is added at the beginning, so we can
+    skip doing it now. The shift is hard-coded, as this saves a few cycles without hurting accuracy
+    in "most" cases.

    It's *possible* we could save one more cycle here by pre-multiplying the bias with the
    requantize multiplier, and then doing the bias addition and shift in the same cycle (via <op2>).
@@ -180,22 +199,27 @@ def _requantize_sums(num_sums) -> Iterator[str]:

    Calling __builtin_arm_ssat directly is a little bit gross, but GCC and Clang are unreliable
    about compiling other ways of writing this. Both the multiply + shift and shift + saturation
-    combine to one instruction each."""
+    combine to one instruction each.
+    """

+    yield "int scale_val = *scale;"
    for i in range(num_sums):
-        yield f"int requant_{i} = (sum_{i} * (long long) requant_scale) >> 32;"
+        yield f"int requant_{i} = (sum_{i} * (long long) scale_val) >> {requantize_shift - 1};"
        yield f"requant_{i} = (requant_{i} + 1) >> 1;"
-        yield f"requant_{i} = __builtin_arm_ssat(requant_{i} - 128, 8);"
+        yield f"requant_{i} = __builtin_arm_ssat(requant_{i} + {output_zero_point}, 8);"


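# A runnable sketch (not part of the diff) of the fixed-point math the generated C performs with
# the new parameters. "scale_q" stands in for whatever fixed-point value the caller stores behind
# the *scale pointer (an assumption); the clamp mirrors __builtin_arm_ssat(x, 8).
def _requantize_sketch(acc, scale_q, requantize_shift=33, output_zero_point=-128):
    requant = (acc * scale_q) >> (requantize_shift - 1)  # int64 multiply + all but one bit of shift
    requant = (requant + 1) >> 1                         # last shift bit, with round-to-nearest
    return max(-128, min(127, requant + output_zero_point))  # add zero point, saturate to int8
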
def _write_sums_to_memory(num_sums, offset, stride) -> Iterator[str]:
-    """Writes the requantized sums to memory. Note - halfword packing here *does* help. It seems
+    """Generates code to write the requantized sums to memory.
+
+    Note - halfword packing here *does* help. It seems
    like it wouldn't, as doing two pipelined int16 stores takes two cycles - the same as halfword
    packing plus a pipelined int32 store. We still do the int16 stores when there is an output
    stride, though.

    However, this lets the compiler re-order instructions to better preserve memory, as it doesn't
-    like breaking apart the store instructions (as this messes up pipelining)."""
+    like breaking apart the store instructions (as this messes up pipelining).
+    """

    if stride > 1:
        for i in range(num_sums):
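# Illustrative only (the exact generated string is an assumption): with an output stride the
# writer falls back to per-value int16 stores along the lines of
#     ((short*) output)[i * stride] = requant_i;
# while the unstrided path packs two requantized values into one word and issues a single
# int32 store, as the docstring above describes.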
@@ -222,10 +246,14 @@ def tensordot_int16_impl(
    dimensions: Tuple[int, int, int],
    offsets: Tuple[int, int, int],
    x_strides: Tuple[int, int],
+    requantize_shift: int = 33,
+    output_zero_point: int = -128,
) -> Tuple[str, str]:
-    """Code for a quantized version of tensordot, which computes `split_size` tensordot operations
-    at the same time. Only works with `int16`. The generated function takes as input pointers to the
-    output, tensor, and kernel, which must be word-aligned.
+    """Generates code to compute a tensor dot product with requantization.
+
+    The generated function takes pointers to the output, tensor, and kernel as input. All pointers
+    must be word aligned. Only works with `int16` data type. The generated code is optimized for the
+    ARMv7E-M architecture.

    Parameters
    ----------
@@ -249,6 +277,14 @@ def tensordot_int16_impl(
        The distance (in halfwords) between the start of each input tensor, and where to write each
        output result respectively. Only used when split_size > 1.

+    requantize_shift: int
+        The distance to right shift after multiplying by the requantization scale. Defaults to 33,
+        as this lets us skip a shift operation.
+
+    output_zero_point: int
+        The output zero point, which will be added after scale multiplication but before
+        clipping. Defaults to -128, as most models use this value.
+

    Returns
    func_name, func_code: Tuple[str, str]
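# Hypothetical usage of the updated signature (argument values are illustrative; the leading
# split_size parameter is inferred from the rest of this diff rather than shown here):
#     func_name, func_code = tensordot_int16_impl(
#         split_size=2,
#         dimensions=(48, 3, 3),
#         offsets=(0, 0, 0),
#         x_strides=(1, 1),
#         requantize_shift=33,
#         output_zero_point=-128,
#     )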
@@ -273,7 +309,9 @@ def gen_single_loop_macs(index):
        return _expand_instruction_tuples(draft_macs_iter, index)

    multiply_acc_lines = chain.from_iterable(gen_single_loop_macs(i) for i in range(split_size))
-    requantize_lines = _requantize_sums(split_size)
+    requantize_lines = _requantize_sums(
+        split_size, requantize_shift=requantize_shift, output_zero_point=output_zero_point
+    )
    write_out_lines = _write_sums_to_memory(split_size, output_offset, out_stride)

    def insert_lines(lines):
@@ -287,7 +325,7 @@ def insert_lines(lines):
        #ifndef {function_name.upper()}_EXISTS
        #define {function_name.upper()}_EXISTS
        __attribute__((always_inline)) static inline int {function_name}(
-            int *output, int *tensor, int *kernel, int bias, int requant_scale
+            int *output, int *tensor, int *kernel, int *bias, int *scale
        ) {{
            {_init_biased_accumulators(split_size)}

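# For illustration (the function name shown is made up): with the pointer-typed bias and scale
# arguments, the emitted prototype now reads:
#     __attribute__((always_inline)) static inline int tensordot_opt_x2_int16_w48_...(
#         int *output, int *tensor, int *kernel, int *bias, int *scale
#     ) {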