Commit 3237b35 — "Formatting fixes"
1 parent: 218f5c9

File tree

3 files changed: +23 additions, −15 deletions

python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def get_c_function_name(split_size, dimensions, offsets, x_strides):
3838
+ (f"_{x_strides[0]}_{x_strides[1]}" if split_size > 1 else "")
3939
)
4040

41+
4142
def _is_pow_2(number):
4243
"""Checks if `number` is a power of `2`."""
4344
return number & (number - 1) == 0 and number > 0
@@ -160,11 +161,11 @@ def _apply_simd_optimizations(instruction_tuples) -> Iterator[Tuple]:
160161
}
161162

162163

163-
#def _no_first_accumulate(instruction_tuples) -> Iterator[Tuple]:
164-
# ins, op1, op2 = next(instruction_tuples)
165-
# yield NO_ACC_PREFIX_CONVERSIONS[ins], op1, op2
166-
# for instruction_tuple in instruction_tuples:
167-
# yield instruction_tuple
164+
# def _no_first_accumulate(instruction_tuples) -> Iterator[Tuple]:
165+
# ins, op1, op2 = next(instruction_tuples)
166+
# yield NO_ACC_PREFIX_CONVERSIONS[ins], op1, op2
167+
# for instruction_tuple in instruction_tuples:
168+
# yield instruction_tuple
168169

169170

170171
def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]:
@@ -205,6 +206,7 @@ def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]:
205206
else:
206207
yield f'asm ("{instruction} %0, %1, %2" : "=r" (sum_{index}) : "r" ({op1}), "r" ({op2}));'
207208

209+
208210
def _requantize_sums(num_sums) -> Iterator[str]:
209211
"""Simulates multiplying by the float32 requantization scale by doing a int64 multiply + shift,
210212
which is much faster. The bias is added at the beginning, so we can skip doing it now. The shift

tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,6 @@ def _get_quant_zp_const(quantization_dict, as_scalar = False):
154154
def test_qnn_conv2d_mobilenetv1_layer(layer, interpreter):
155155
in_dtype = "int8"
156156
schedule_name, dtype, padding, strides = _get_layer_attributes(layer)
157-
158157
"""Load the input, kernel, bias, and generated output from each layer when it was run by the
159158
TensorFlow TFLite interpreter. The tensor values are quantized (though note that biases_tensor
160159
is an int32), while the quantization data is not. Note the zero points are zero everywhere
@@ -163,10 +162,13 @@ def test_qnn_conv2d_mobilenetv1_layer(layer, interpreter):
163162
def lookup(detail):
164163
return interpreter.get_tensor(detail["index"]), detail["quantization_parameters"]
165164
inputs_tensor, inputs_quant = lookup(_get_main_path_tensor_details(tensor_details, layer))
165+
print(inputs_tensor.shape)
166166
kernel_tensor, kernel_quant = lookup(_get_kernel_details(tensor_details, layer))
167+
print(kernel_tensor.shape)
167168
biases_tensor, biases_quant = lookup(_get_bias_details(tensor_details, layer))
169+
print(biases_tensor.shape)
168170
output_tensor, output_quant = lookup(_get_main_path_tensor_details(tensor_details, layer + 1))
169-
171+
out_channel_multiplier, kernel_h, kernel_w, in_channels = kernel_tensor.shape
170172

171173
# Reshape tensors to match the layouts we will see after legalization
172174
if layer % 2 == 0: # Regular conv2d
@@ -187,15 +189,15 @@ def lookup(detail):
187189
kernel_zero_point=_get_quant_zp_const(kernel_quant),
188190
input_scale=_get_quant_scale_const(inputs_quant, as_scalar=True),
189191
kernel_scale=_get_quant_scale_const(kernel_quant),
190-
kernel_size=(3, 3),
192+
kernel_size=(kernel_h, kernel_w),
191193
data_layout=new_inputs_layout,
192194
kernel_layout=new_kernel_layout,
193195

194196
dilation=(1, 1),
195197
strides=strides,
196198
padding=padding,
197-
groups=(1 if layer % 2 == 0 else 3),
198-
channels=8,
199+
groups=(1 if layer % 2 == 0 else in_channels),
200+
channels=(out_channel_multiplier if layer % 2 == 0 else in_channels),
199201
out_dtype="int32",
200202
)
201203

@@ -219,7 +221,7 @@ def lookup(detail):
219221
test_model = AOTTestModel(
220222
module=tvm.IRModule.from_expr(test_function),
221223
inputs={"input": inputs_ndarr},
222-
outputs={"Identity": output_ndarr},
224+
outputs={"output": output_ndarr},
223225
)
224226

225227
compile_and_run(
@@ -229,7 +231,7 @@ def lookup(detail):
229231
use_unpacked_api=True,
230232
target_opts={
231233
"-keys": "arm_cpu",
232-
"-mcpu": "cortex-m4",
234+
"-mcpu": "cortex-m7",
233235
},
234236
schedule_name=schedule_name,
235237
verbose=True,

tests/python/topi/python/test_topi_conv2d_tensordot_opts.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,13 @@
2929

3030

3131
def test_write_3x3_depthwise_code():
32-
"""
32+
"""This is the function that would be generated for a 1x4x48x48 NCHW input tensor with "SAME"
33+
padding. We are only computing one sum at once, so we don't need stride or output. Note that
34+
this is pretty inefficient - it would be much better to compute a few sums concurrently.
35+
36+
When inlined, this code compiles (with armv7-a clang 11) into:
3337
38+
tensordot_opt_x1_int16_w48_3x3_000(int*, int*, int*, int*, int*):
3439
ldr.w lr, [r3]
3540
ldrd r11, r4, [r1]
3641
ldrd r5, r9, [r1, #96]
@@ -225,7 +230,6 @@ def test_1x1x8_convolution_code():
225230
)
226231

227232

228-
229233
def test_3x3x3_offset_convolution_code():
230234
"""This is the function that would be generated for a 1x96x96x3 NHWC input tensor under
231235
standard convolution with a 3x3x3 kernel - the first layer of MobileNetV1. This is special, as
@@ -304,4 +308,4 @@ def test_3x3x3_offset_convolution_code():
304308
return 0;
305309
}
306310
"""
307-
)
311+
)

Comments (0)