Skip to content

Commit 0b95780

Browse files
authored
[microNPU] Cascader performance model bugfixes (#10510)

* [microNPU] Performance model bugfixes
  * Fixed incorrect num_blocks calculations for both BufferModes.
  * Fixed similar issues with Read/Write byte calculations.
  * Fixed an issue where the 'partkernel' flag was not propagated to the performance estimation code.
  * Fixed the single-buffering check, which incorrectly used the output shape and block rather than the input shape and block.
  * Fixed block config not being aligned to the micro block for Elementwise.

  Change-Id: Ide6b231bc1a17c65bed20129d2179a215ada14b2

* Address review comment
  * Changed incorrect usage of 'max_width' to 'max_depth'.
1 parent 8cf0c3e commit 0b95780

File tree

5 files changed

+45
-35
lines changed

5 files changed

+45
-35
lines changed

python/tvm/contrib/ethosu/cascader/device_config.py

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ def _get_input_block(
288288
input_shape: _Shape,
289289
dtype: str,
290290
op_type: str,
291-
is_partkernel: bool,
291+
partkernel: bool,
292292
stride_h: int,
293293
stride_w: int,
294294
dilated_kernel_h: int,
@@ -310,7 +310,7 @@ def _get_input_block(
310310

311311
if op_type == "ethosu_conv2d":
312312
if dtype == "int8":
313-
if is_partkernel:
313+
if partkernel:
314314
depth = self._align(min(32, input_shape.depth), 8)
315315
else:
316316
depth = self._align(min(16, input_shape.depth), 8)
@@ -336,7 +336,7 @@ def get_kernel_steps(
336336
dilated_kernel_h: int,
337337
dilated_kernel_w: int,
338338
ifm_dtype: str,
339-
is_partkernel: bool = False,
339+
partkernel: bool = False,
340340
) -> List[int]:
341341
"""Calculate the total number of subkernels and their sizes
342342
@@ -351,7 +351,7 @@ def get_kernel_steps(
351351
Width of dilated kernel
352352
ifm_dtype: str
353353
Datatype of the Input Feature Map tensor (IFM)
354-
is_partkernel: bool
354+
partkernel: bool
355355
Flag showing whether part-kernel first traversal is used
356356
357357
Returns
@@ -368,7 +368,7 @@ def get_kernel_steps(
368368
kernel_steps = []
369369
for y, x in subkernels:
370370
subkernel_elements = x * y
371-
if op_type == "ethosu_conv2d" and is_partkernel:
371+
if op_type == "ethosu_conv2d" and partkernel:
372372
# Part-kernel-first traversal conv2d
373373
divisor = 4 if ifm_dtype == "int8" else 2
374374
kernel_steps.append(int(_round_up_div(subkernel_elements, divisor)))
@@ -509,29 +509,31 @@ def get_elementwise_block_config(
509509
banks_available -= 2
510510

511511
# Split the block in half until it fits into SHRAM
512+
max_height, max_width, max_depth = self._max_block_shape.as_list()[1:]
512513
if output_layout == "NHCWB16":
513514
split_order = (a for a in [1, 3, 2])
514515
output_block = [
515516
output_shape[0],
516-
min(output_shape[1], self._max_block_shape.height),
517-
min(output_shape[2] * output_shape[4], self._max_block_shape.depth),
518-
min(output_shape[3], self._max_block_shape.width),
517+
_round_up(min(output_shape[1], max_height), self._micro_block.height),
518+
min(output_shape[2] * output_shape[4], max_depth),
519+
_round_up(min(output_shape[3], max_width), self._micro_block.width),
519520
16,
520521
]
521522
else:
522523
split_order = (a for a in [1, 2, 3])
523524
output_block = [
524525
output_shape[0],
525-
min(output_shape[1], self._max_block_shape.height),
526-
min(output_shape[2], self._max_block_shape.width),
527-
min(output_shape[3], self._max_block_shape.depth),
526+
_round_up(min(output_shape[1], max_height), self._micro_block.height),
527+
_round_up(min(output_shape[2], max_width), self._micro_block.width),
528+
_round_up(min(output_shape[3], max_depth), self._micro_block.depth),
528529
]
529530
split_axis = next(split_order)
531+
532+
offset = [0] * len(output_block)
533+
stripes = [1] * len(output_block)
534+
order = [1, 2, 4, 3, 0] if output_layout == "NHCWB16" else [1, 2, 3, 4]
530535
while True:
531536
# Create stripe config for output block
532-
offset = [0] * len(output_block)
533-
stripes = [1] * len(output_block)
534-
order = [1, 2, 4, 3, 0] if output_layout == "NHCWB16" else [1, 2, 3, 4]
535537
output_stripe_config = StripeConfig(
536538
output_block, output_block, output_block, order, stripes, offset
537539
)
@@ -564,10 +566,12 @@ def get_elementwise_block_config(
564566
block_config.append(BlockConfig(output_block, output_block, 0, output_cycles))
565567
break
566568

567-
if output_block[split_axis] == 1:
569+
if output_block[split_axis] == self._micro_block.as_list()[split_axis]:
568570
split_axis = next(split_order)
569571

570-
output_block[split_axis] = _round_up_div(output_block[split_axis], 2)
572+
output_block[split_axis] = _round_up(
573+
_round_up_div(output_block[split_axis], 2), self._micro_block.as_list()[split_axis]
574+
)
571575

572576
return block_config
573577

@@ -670,9 +674,9 @@ def get_valid_block_configs(
670674

671675
# Input block depth has additional limitations for operators that require full input depth
672676
input_block_depth = 0
673-
is_partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
677+
partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
674678
if op_type == "ethosu_conv2d":
675-
if is_partkernel:
679+
if partkernel:
676680
input_block_depth = min(ifm_channels, 16)
677681
else:
678682
input_block_depth = min(ifm_channels, 32)
@@ -745,7 +749,8 @@ def get_valid_block_configs(
745749
kernel_h,
746750
kernel_w,
747751
ifm_channels,
748-
is_partkernel,
752+
"int8",
753+
partkernel,
749754
)
750755
block_config = BlockConfig(
751756
input_block_shape.as_list(), output_block, compute_cycles, output_cycles
@@ -767,15 +772,15 @@ def _estimate_compute_cycles_per_block(
767772
kernel_w: int,
768773
input_channels: int,
769774
ifm_dtype: str,
770-
is_partkernel: bool = False,
775+
partkernel: bool = False,
771776
) -> Tuple[int, int]:
772777
# Calculate the amount of micro blocks per block, per axis
773778
num_quantum_x = _round_up_div(block_shape.width, self._micro_block.width)
774779
num_quantum_y = _round_up_div(block_shape.height, self._micro_block.height)
775780
num_quantum_z = _round_up_div(block_shape.depth, self._micro_block.depth)
776781
num_quantum_xy = num_quantum_x * num_quantum_y
777782

778-
kernel_steps = self.get_kernel_steps(op_type, kernel_h, kernel_w, ifm_dtype, is_partkernel)
783+
kernel_steps = self.get_kernel_steps(op_type, kernel_h, kernel_w, ifm_dtype, partkernel)
779784

780785
wd_cycles = self._get_weight_decoder_cycles(op_type)
781786
delay_cycles = self._get_delay_cycles(op_type, ifm_dtype)
@@ -794,7 +799,7 @@ def _estimate_compute_cycles_per_block(
794799
elif subkernel_steps > 1:
795800
compute_cycles += delay_cycles * (subkernel_steps - 1) * num_quantum_z
796801

797-
if is_partkernel:
802+
if partkernel:
798803
compute_cycles *= _round_up_div(input_block_shape.depth, 8)
799804

800805
if op_type == "ethosu_conv2d":

src/contrib/ethosu/cascader/parts/ethosu.cc

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ const BlockConfig EthosuPartNode::GetBlockConfig(const StripeConfig& output_stri
7474
BlockConfig best_block_config;
7575
float best_cost = std::numeric_limits<float>::infinity();
7676
std::vector<int> output_stripe_shape = output_stripe_config->GetShape();
77+
auto input_stripe_configs = CalculateInputStripeConfigs(output_stripe_config);
78+
std::vector<int> input_stripe_shape = input_stripe_configs[0]->GetShape();
7779

7880
for (const auto& block_config : valid_block_configs_) {
7981
std::vector<int> output_block = block_config->GetOutputBlockShape();
@@ -86,7 +88,7 @@ const BlockConfig EthosuPartNode::GetBlockConfig(const StripeConfig& output_stri
8688
mul_reduce(output_stripe_shape);
8789

8890
// Single buffering hardware optimization
89-
if (mul_reduce(output_stripe_shape) <= 2 * mul_reduce(output_block)) {
91+
if (mul_reduce(input_stripe_shape) <= 2 * mul_reduce(block_config->GetInputBlockShape())) {
9092
relative_cost /= 2;
9193
}
9294

@@ -107,25 +109,25 @@ const PerformanceInfo EthosuPartNode::GetPerformanceInfo(const StripeConfig& out
107109
std::vector<int64_t> bytes_per_input =
108110
GetBytesRead(block_shape, output_stripe_config->GetShape());
109111

110-
int elements_per_block = mul_reduce(block_shape);
111-
int bytes_per_output = elements_per_block;
112112
float num_blocks = 1.0f;
113113
for (size_t i = 0; i < block_shape.size(); i++) {
114114
if (buffer_mode == BufferMode::RECOMPUTE) {
115-
num_blocks *= static_cast<float>(output_stripe_config->GetShape()[i] *
116-
output_stripe_config->GetStripes()[i]) /
117-
block_shape[i];
115+
num_blocks *= std::max(static_cast<float>(output_stripe_config->GetShape()[i]) /
116+
block_shape[i] * output_stripe_config->GetStripes()[i],
117+
1.0f);
118118
} else {
119119
num_blocks *=
120-
std::max(static_cast<float>(output_stripe_config->GetExtent()[i]) / block_shape[i], 1.0f);
120+
std::max(static_cast<float>(output_tensor_->GetShape()[i]) / block_shape[i], 1.0f);
121121
}
122122
}
123-
float num_stripes = mul_reduce(output_stripe_config->GetStripes()) - 1.0f;
123+
124+
float num_stripes = mul_reduce(output_stripe_config->GetStripes());
124125
std::vector<int64_t> read_bytes;
125-
for (int block_bytes : bytes_per_input) {
126-
read_bytes.push_back((num_blocks + num_stripes) * block_bytes);
126+
for (int64_t stripe_bytes : bytes_per_input) {
127+
read_bytes.push_back(num_stripes * stripe_bytes);
127128
}
128-
int64_t write_bytes = (num_blocks + num_stripes) * bytes_per_output;
129+
int64_t write_bytes =
130+
num_blocks * mul_reduce(block_shape) * output_tensor_->GetDataType().bytes();
129131

130132
int block_output_cycles = block_config->GetOutputCycles();
131133
int block_compute_cycles = block_config->GetComputeCycles();

tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@
207207
((1, 7, 10, 16), (1, 7, 1, 10, 16)),
208208
((1, 7, 6, 16), (1, 7, 1, 6, 16)),
209209
# Pooling
210-
((1, 1, 2, 80), (1, 1, 5, 2, 16)),
210+
((1, 1, 2, 16), (1, 1, 1, 2, 16)),
211211
((1, 10, 6, 16), (1, 10, 1, 6, 16)),
212212
],
213213
),
@@ -225,7 +225,7 @@
225225
((1, 8, 20, 16), (1, 8, 1, 20, 16)),
226226
((1, 14, 6, 16), (1, 14, 1, 6, 16)),
227227
# Pooling
228-
((1, 2, 2, 48), (1, 2, 3, 2, 16)),
228+
((1, 2, 2, 16), (1, 2, 1, 2, 16)),
229229
((1, 10, 12, 16), (1, 10, 1, 12, 16)),
230230
],
231231
),

tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ def test_ethosu_part():
4747
)
4848
input_tensor = cs.Tensor(shape=[1, 66, 74, 16], dtype="int8")
4949
part.set_input(0, input_tensor)
50+
output_tensor = cs.Tensor(shape=[1, 66, 74, 16], dtype="int8")
51+
part.set_output(output_tensor)
5052

5153
assert part.get_stripe_align_hint() == output_quantum
5254
# Check that the performance model runs, don't verify output

tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ def test_conv_performance(
216216
)
217217
part.set_input(0, cs.Tensor(in_shape, "int8"))
218218
part.set_input(1, cs.Tensor([ifm_channels, kernel[0], kernel[1], out_shape[-1]], "int8"))
219+
part.set_output(cs.Tensor(out_shape, "int8"))
219220

220221
stripes = [1] * len(output_quantum)
221222
offset = [0] * len(output_quantum)

0 commit comments

Comments
 (0)