apache
diff --git a/‎python/tvm/contrib/ethosu/cascader/device_config.py‎
Lines changed: 48 additions & 32 deletions b/‎python/tvm/contrib/ethosu/cascader/device_config.py‎
Lines changed: 48 additions & 32 deletions
diff --git a/‎python/tvm/relay/backend/contrib/ethosu/te/common.py‎
Lines changed: 4 additions & 1 deletion b/‎python/tvm/relay/backend/contrib/ethosu/te/common.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎src/contrib/ethosu/cascader/parts/ethosu.cc‎
Lines changed: 22 additions & 15 deletions b/‎src/contrib/ethosu/cascader/parts/ethosu.cc‎
Lines changed: 22 additions & 15 deletions
diff --git a/‎src/contrib/ethosu/cascader/parts/ethosu.h‎
Lines changed: 8 additions & 0 deletions b/‎src/contrib/ethosu/cascader/parts/ethosu.h‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/target/source/interface_c.cc‎
Lines changed: 6 additions & 6 deletions b/‎src/target/source/interface_c.cc‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎tests/cpp/target/source/interface_c_test.cc‎
Lines changed: 27 additions & 0 deletions b/‎tests/cpp/target/source/interface_c_test.cc‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py‎
Lines changed: 20 additions & 20 deletions b/‎tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py‎
Lines changed: 20 additions & 20 deletions
@@ -84,7 +84,7 @@ def __init__(self, device: str, disable_block_bulling: bool = False):
 
             self._total_banks = 48
             self._reserved_banks = 4
-            self._input_granularity = 8
+            self._input_granularity = {1: 8, 2: 8, 4: 16}
             self._accumulator_granularity = {4: 16, 5: 20}
             self._lut_reserved = True
         elif self._device == "ethos-u55-128":
@@ -96,7 +96,7 @@ def __init__(self, device: str, disable_block_bulling: bool = False):
 
             self._total_banks = 24
             self._reserved_banks = 4
-            self._input_granularity = 4
+            self._input_granularity = {1: 4, 2: 4, 4: 8}
             self._accumulator_granularity = {4: 8, 5: 12}
             self._lut_reserved = True
         elif self._device == "ethos-u55-64":
@@ -108,7 +108,7 @@ def __init__(self, device: str, disable_block_bulling: bool = False):
 
             self._total_banks = 16
             self._reserved_banks = 2
-            self._input_granularity = 2
+            self._input_granularity = {1: 2, 2: 2, 4: 4}
             self._accumulator_granularity = {4: 4, 5: 8}
             self._lut_reserved = False
         elif self._device == "ethos-u55-32":
@@ -120,8 +120,8 @@ def __init__(self, device: str, disable_block_bulling: bool = False):
 
             self._total_banks = 16
             self._reserved_banks = 2
-            self._input_granularity = 2
-            self._accumulator_granularity = {4: 4, 5: 8}
+            self._input_granularity = {1: 2, 2: 2, 4: 4}
+            self._accumulator_granularity = {4: 4, 5: 4}
             self._lut_reserved = False
 
     def _get_output_cycles(
@@ -448,18 +448,32 @@ def _get_input_banks(self, input_block_shape, input_bytewidth):
             input_block_shape.depth * input_bytewidth, 8
         )
         input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
-        input_banks = _round_up(input_banks, self._input_granularity)
+        input_banks = _round_up(input_banks, self._input_granularity[input_bytewidth])
 
         return input_banks
 
-    def _get_accumulator_banks(self, output_block_shape, acc_bytewidth, depth):
-        acc_depth = _round_up(min(output_block_shape.depth, depth), 8)
+    def _get_accumulator_banks(self, output_block_shape, acc_bytewidth):
+        acc_depth = _round_up(output_block_shape.depth, 8)
         acc_bytes = output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
         acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
         acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
 
         return acc_banks
 
+    @staticmethod
+    def _create_layout_block(nhwc_block_config, layout):
+        """A helper function to convert to brick layout"""
+        if layout == "NHCWB16":
+            return [
+                nhwc_block_config[0],
+                nhwc_block_config[1],
+                1 + ((nhwc_block_config[3] - 1) // 16),
+                nhwc_block_config[2],
+                16,
+            ]
+        # else it could only be NHWC
+        return nhwc_block_config
+
     def get_elementwise_block_config(
         self,
         ifm_propagator: Propagator,
@@ -537,22 +551,22 @@ def get_elementwise_block_config(
         # Split the block in half until it fits into SHRAM
         max_height, max_width, max_depth = self._max_block_shape.as_list()[1:]
         if output_layout == "NHCWB16":
-            split_order = (a for a in [1, 3, 2])
-            output_block = [
-                output_shape[0],
-                _round_up(min(output_shape[1], max_height), self._micro_block.height),
-                min(output_shape[2] * output_shape[4], max_depth),
-                _round_up(min(output_shape[3], max_width), self._micro_block.width),
-                16,
-            ]
+            output_height = output_shape[1]
+            output_width = output_shape[3]
+            output_channels = output_shape[2] * 16
         else:
-            split_order = (a for a in [1, 2, 3])
-            output_block = [
-                output_shape[0],
-                _round_up(min(output_shape[1], max_height), self._micro_block.height),
-                _round_up(min(output_shape[2], max_width), self._micro_block.width),
-                _round_up(min(output_shape[3], max_depth), self._micro_block.depth),
-            ]
+            output_height = output_shape[1]
+            output_width = output_shape[2]
+            output_channels = output_shape[3]
+
+        output_nhwc_block = [
+            1,
+            _round_up(min(output_height, max_height), self._micro_block.height),
+            _round_up(min(output_width, max_width), self._micro_block.width),
+            _round_up(min(output_channels, max_depth), self._micro_block.depth),
+        ]
+        output_block = self._create_layout_block(output_nhwc_block, output_layout)
+        split_order = (a for a in [1, 2, 3])
         split_axis = next(split_order)
 
         offset = [0] * len(output_block)
@@ -572,7 +586,7 @@ def get_elementwise_block_config(
                 )
             else:
                 # Unary elementwise
-                input2_block = _Shape([0, 0, 0, 0])
+                input2_block = input_block
 
             input_block.round_up(self._input_micro_block)
             input2_block.round_up(self._input_micro_block)
@@ -589,15 +603,19 @@ def get_elementwise_block_config(
                 )
                 output_cycles *= reduce(lambda a, b: a * b, output_block, 1)
                 output_cycles = int(math.ceil(output_cycles))
-                block_config.append(BlockConfig(output_block, output_block, 0, output_cycles))
+                block_config.append(
+                    BlockConfig(input_block.as_list(), output_block, 0, output_cycles)
+                )
                 break
 
-            if output_block[split_axis] == self._micro_block.as_list()[split_axis]:
+            if output_nhwc_block[split_axis] == self._micro_block.as_list()[split_axis]:
                 split_axis = next(split_order)
 
-            output_block[split_axis] = _round_up(
-                _round_up_div(output_block[split_axis], 2), self._micro_block.as_list()[split_axis]
+            output_nhwc_block[split_axis] = _round_up(
+                _round_up_div(output_nhwc_block[split_axis], 2),
+                self._micro_block.as_list()[split_axis],
             )
+            output_block = self._create_layout_block(output_nhwc_block, output_layout)
 
         return block_config
 
@@ -739,7 +757,7 @@ def get_valid_block_configs(
                             height,
                             1 + ((depth - 1) // 16),
                             width,
-                            min(16, _round_up(ofm_channels, self._micro_block.depth)),
+                            16,
                         )
                         order = [1, 2, 4, 3, 0]
                     else:
@@ -771,9 +789,7 @@ def get_valid_block_configs(
                     # Banks required for input block
                     input_banks = self._get_input_banks(input_block_shape, input_bytewidth)
                     # Banks required for accumulation
-                    acc_banks = self._get_accumulator_banks(
-                        output_block_shape, acc_bytewidth, depth
-                    )
+                    acc_banks = self._get_accumulator_banks(output_block_shape, acc_bytewidth)
 
                     if (input_banks + acc_banks) <= banks_available:
                         output_cycles = self._get_output_cycles(
 
@@ -53,7 +53,10 @@ def get_layout_transform_matrices(ofm_channels: int) -> Tuple[List[List[float]],
         [1, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0],
-        [0, 0, 0, 0, 0, ofm_channels],
+        # We need to offset only if the number of ofm_channels is not divisible by 16.
+        # Moreover, we can't use just "ofm_channels" as the last element because
+        # the propagation matrices are used to propagate block configs as well.
+        [0, 0, 16, 0, 0, -(int(ofm_channels % 16 != 0)) * (16 - ofm_channels % 16)],
         [0, 0, 0, 0, 0, 1],
     ]
 
 
@@ -70,34 +70,41 @@ const std::vector<int64_t> EthosuPartNode::GetBytesRead(const std::vector<int>&
   return bytes_per_input;
 }
 
-const BlockConfig EthosuPartNode::GetBlockConfig(const StripeConfig& output_stripe_config) {
-  BlockConfig best_block_config;
-  float best_cost = std::numeric_limits<float>::infinity();
+float EthosuPartNode::CalculateCost(const BlockConfig& block_config,
+                                    const StripeConfig& output_stripe_config) {
+  std::vector<int> output_block = block_config->GetOutputBlockShape();
   std::vector<int> output_stripe_shape = output_stripe_config->GetShape();
   auto input_stripe_configs = CalculateInputStripeConfigs(output_stripe_config);
   std::vector<int> input_stripe_shape = input_stripe_configs[0]->GetShape();
 
-  for (const auto& block_config : valid_block_configs_) {
-    std::vector<int> output_block = block_config->GetOutputBlockShape();
+  std::vector<int64_t> bytes_per_input = GetBytesRead(output_block, output_stripe_shape);
+  bytes_per_input[0] *= subkernels_;
 
-    std::vector<int64_t> bytes_per_input = GetBytesRead(output_block, output_stripe_shape);
-    bytes_per_input[0] *= subkernels_;
+  // Calculate bytes read per output element
+  float cost =
+      static_cast<float>(bytes_per_input[0] + bytes_per_input[1]) / mul_reduce(output_stripe_shape);
 
-    // Calculate bytes read per output element
-    float relative_cost = static_cast<float>(bytes_per_input[0] + bytes_per_input[1]) /
-                          mul_reduce(output_stripe_shape);
+  // Single buffering hardware optimization
+  if (mul_reduce(input_stripe_shape) <= 2 * mul_reduce(block_config->GetInputBlockShape())) {
+    cost /= 2;
+  }
+  return cost;
+}
 
-    // Single buffering hardware optimization
-    if (mul_reduce(input_stripe_shape) <= 2 * mul_reduce(block_config->GetInputBlockShape())) {
-      relative_cost /= 2;
-    }
+const BlockConfig EthosuPartNode::GetBlockConfig(const StripeConfig& output_stripe_config) {
+  BlockConfig best_block_config = valid_block_configs_[0];
+  float best_cost = CalculateCost(best_block_config, output_stripe_config);
+  std::vector<int> output_stripe_shape = output_stripe_config->GetShape();
+  auto input_stripe_configs = CalculateInputStripeConfigs(output_stripe_config);
+  std::vector<int> input_stripe_shape = input_stripe_configs[0]->GetShape();
 
+  for (const auto& block_config : valid_block_configs_) {
+    float relative_cost = CalculateCost(block_config, output_stripe_config);
     if (relative_cost < best_cost) {
       best_block_config = block_config;
       best_cost = relative_cost;
     }
   }
-
   return best_block_config;
 }
 
 
@@ -75,6 +75,14 @@ class EthosuPartNode : public PartNode {
   const std::vector<int64_t> GetBytesRead(const std::vector<int>& block_shape,
                                           const std::vector<int>& full_shape);
 
+  /*!
+   * \brief Get cost heuristic of using a given block config with the associated stripe config
+   * \param block_config The block config that is being checked for the cost
+   * \param output_stripe_config The striping configuration associated with the operator
+   * \return A cost heuristic representative of the choice
+   */
+  float CalculateCost(const BlockConfig& block_config, const StripeConfig& output_stripe_config);
+
   /*! \brief List of block configs that are valid for this part */
   std::vector<BlockConfig> valid_block_configs_;
   /*! \brief The output volume that is atomically computed */
 
@@ -167,12 +167,12 @@ class InterfaceCNode : public runtime::ModuleNode {
       code_stream << " * \\param outputs Output tensors for the module \n";
     }
 
-    if (!devices_.empty()) {
-      code_stream << " * \\param devices Device context pointers for the module \n";
-    }
     if (!pools_.empty()) {
       code_stream << " * \\param workspace_pools Workspace memory pool pointers for the module \n";
     }
+    if (!devices_.empty()) {
+      code_stream << " * \\param devices Device context pointers for the module \n";
+    }
 
     code_stream << " */\n"
                 << "int32_t " << run_function << "(\n";
@@ -182,12 +182,12 @@ class InterfaceCNode : public runtime::ModuleNode {
       call_args_ss << "  struct " << inputs_struct << "* inputs,\n";
       call_args_ss << "  struct " << outputs_struct << "* outputs,\n";
     }
-    if (!devices_.empty()) {
-      call_args_ss << "  struct " << devices_struct << "* devices,\n";
-    }
     if (!pools_.empty()) {
       call_args_ss << "  struct " << pools_struct << "* workspace_pools,\n";
     }
+    if (!devices_.empty()) {
+      call_args_ss << "  struct " << devices_struct << "* devices,\n";
+    }
     std::string call_args_str = call_args_ss.str();
     call_args_str.pop_back();
     call_args_str.pop_back();
 
@@ -126,6 +126,33 @@ TEST(InterfaceAPI, ContainsRunFunctionWithWorkspacePools) {
   ASSERT_THAT(header_source, HasSubstr(run_function.str()));
 }
 
+TEST(InterfaceAPI, ContainsRunFunctionWithWorkspacePoolsAndDevices) {
+  std::stringstream run_function;
+
+  run_function << "/*!\n"
+               << " * \\brief entrypoint function for TVM module \"ultimate_cat_spotter\"\n"
+               << " * \\param inputs Input tensors for the module \n"
+               << " * \\param outputs Output tensors for the module \n"
+               << " * \\param workspace_pools Workspace memory pool pointers for the module \n"
+               << " * \\param devices Device context pointers for the module \n"
+               << " */\n"
+               << "int32_t tvmgen_ultimate_cat_spotter_run(\n"
+               << "  struct tvmgen_ultimate_cat_spotter_inputs* inputs,\n"
+               << "  struct tvmgen_ultimate_cat_spotter_outputs* outputs,\n"
+               << "  struct tvmgen_ultimate_cat_spotter_workspace_pools* workspace_pools,\n"
+               << "  struct tvmgen_ultimate_cat_spotter_devices* devices\n"
+               << ");\n";
+
+  PoolInfo pool_info = PoolInfo("my_memory_pool", {});
+  tir::usmp::AllocatedPoolInfo allocated_pool_info =
+      tir::usmp::AllocatedPoolInfo(pool_info, 100000);
+  runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"},
+                                                 {allocated_pool_info}, {}, {"device"}, 0);
+  std::string header_source = test_module->GetSource();
+
+  ASSERT_THAT(header_source, HasSubstr(run_function.str()));
+}
+
 TEST(InterfaceAPI, ContainsRunFunctionWithWorkspaceIO) {
   std::stringstream run_function_with_map_functions;
 
 
@@ -166,14 +166,14 @@
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 ((1, 4, 4, 96), (1, 4, 6, 4, 16)),
                 ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-                ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 10, 1, 6, 4)),
+                ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 8, 1, 4, 16)),
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 # Depthwise Conv2D
-                ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
-                ((1, 8, 5, 16), (1, 8, 1, 5, 16)),
+                ((1, 6, 10, 16), (1, 4, 1, 12, 16)),
+                ((1, 8, 5, 16), (1, 6, 1, 5, 16)),
                 # Pooling
-                ((1, 1, 1, 128), (1, 1, 8, 1, 16)),
-                ((1, 9, 6, 16), (1, 9, 1, 6, 16)),
+                ((1, 1, 1, 128), (1, 1, 4, 1, 16)),
+                ((1, 9, 6, 16), (1, 8, 1, 4, 16)),
             ],
         ),
         (
@@ -184,14 +184,14 @@
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 ((1, 4, 4, 96), (1, 4, 6, 4, 16)),
                 ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-                ((1, 10, 6, 8), (1, 10, 1, 6, 8)),
+                ((1, 10, 6, 8), (1, 8, 1, 4, 16)),
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 # Depthwise Conv2D
-                ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
-                ((1, 8, 5, 16), (1, 8, 1, 5, 16)),
+                ((1, 6, 10, 16), (1, 4, 1, 12, 16)),
+                ((1, 8, 5, 16), (1, 6, 1, 5, 16)),
                 # Pooling
-                ((1, 1, 1, 128), (1, 1, 8, 1, 16)),
-                ((1, 9, 6, 16), (1, 9, 1, 6, 16)),
+                ((1, 1, 1, 128), (1, 1, 4, 1, 16)),
+                ((1, 9, 6, 16), (1, 8, 1, 4, 16)),
             ],
         ),
         (
@@ -202,15 +202,15 @@
                 ((1, 5, 8, 16), (1, 5, 1, 8, 16)),
                 ((1, 4, 4, 128), (1, 4, 8, 4, 16)),
                 ((1, 16, 4, 16), (1, 16, 1, 4, 16)),
-                ((1, 8, 12, 8), (1, 8, 1, 12, 8)),
-                ((1, 10, 6, 16), (1, 10, 1, 6, 16)),
+                ((1, 8, 12, 8), (1, 10, 1, 6, 16)),
+                ((1, 10, 6, 16), (1, 10, 1, 6, 16), (1, 6, 1, 6, 16)),
                 # Depthwise Conv2D
-                ((1, 7, 10, 16), (1, 7, 1, 10, 16), (1, 7, 2, 10, 16)),
-                ((1, 10, 6, 16), (1, 10, 1, 6, 16)),
+                ((1, 7, 10, 16), (1, 7, 1, 10, 16), (1, 6, 1, 10, 16)),
+                ((1, 10, 6, 16), (1, 10, 1, 6, 16), (1, 6, 1, 6, 16)),
                 # Pooling
                 # ((1, 1, 2, 16), (1, 1, 1, 2, 16)),
-                ((1, 1, 2, 128), (1, 1, 8, 2, 16)),
-                ((1, 10, 6, 16), (1, 10, 1, 6, 16)),
+                ((1, 1, 2, 128), (1, 1, 4, 2, 16)),
+                ((1, 10, 6, 16), (1, 9, 1, 6, 16)),
             ],
         ),
         (
@@ -221,14 +221,14 @@
                 ((1, 16, 8, 16), (1, 16, 1, 8, 16)),
                 ((1, 4, 4, 128), (1, 4, 8, 4, 16)),
                 ((1, 32, 4, 16), (1, 10, 12, 16), (1, 32, 1, 4, 16), (1, 10, 1, 12, 16)),
-                ((1, 20, 12, 8), (1, 20, 1, 12, 8)),
+                ((1, 20, 12, 8), (1, 10, 1, 12, 16)),
                 ((1, 12, 10, 16), (1, 12, 1, 10, 16)),
                 # Depthwise Conv2D
-                ((1, 8, 20, 16), (1, 8, 1, 20, 16), (1, 8, 2, 20, 16)),
-                ((1, 14, 6, 16), (1, 14, 1, 6, 16)),
+                ((1, 8, 20, 16), (1, 6, 1, 20, 16), (1, 6, 2, 20, 16)),
+                ((1, 14, 6, 16), (1, 12, 1, 6, 16)),
                 # Pooling
                 # ((1, 2, 2, 16), (1, 2, 1, 2, 16)),
-                ((1, 2, 2, 128), (1, 2, 8, 2, 16)),
+                ((1, 2, 2, 128), (1, 2, 6, 2, 16)),
                 ((1, 10, 12, 16), (1, 10, 1, 12, 16)),
             ],
         ),