Commit 23685d9

[microNPU] Fix layout transform matrix
One of the layout transforms currently causes the cascader to stripe across the B16 axis (which is not allowed), so change that and handle the implications for get_valid_block_configs.

Change-Id: I04199f9f35fcc31618581567483cfb80d3b5aad2
1 parent: 7a6281e · commit: 23685d9
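
The row changed below in convolution.py, depthwise.py, pooling.py and the test infrastructure is the NHCWB16-to-NHWC part of the OFM propagator's layout transform. A minimal sketch of the effect of that change follows: the two matrices are the ones from the diff, while the stripe vector and the ofm_channels value are invented example numbers, and the interpretation (matrices acting on augmented [n, h, c, w, b, 1] shape vectors) is an assumption for illustration only.

# Rough illustration (not part of the patch) of the changed layout transform row.
import numpy as np

ofm_channels = 32  # hypothetical full output depth

old_nhcwb16_to_nhwc = np.array([
    [1, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0],
    [0, 0, 0, 1, 0, 0],
    [0, 0, 16, 0, 1, -16],          # channel extent derived from the C *and* B16 axes
    [0, 0, 0, 0, 0, 1],
])
new_nhcwb16_to_nhwc = np.array([
    [1, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0],
    [0, 0, 0, 1, 0, 0],
    [0, 0, 0, 0, 0, ofm_channels],  # channel extent pinned to the full depth
    [0, 0, 0, 0, 0, 1],
])

stripe_nhcwb16 = np.array([1, 4, 2, 6, 8, 1])  # example [n, h, c, w, b, 1], b < 16

print(old_nhcwb16_to_nhwc @ stripe_nhcwb16)  # -> [1 4 6 24 1]: depends on the B16 value
print(new_nhcwb16_to_nhwc @ stripe_nhcwb16)  # -> [1 4 6 32 1]: always the full depth

With the old row the propagated channel extent depends on the innermost B16 coordinate, which is what let the cascader propose stripes that split the B16 axis; the new row pins the extent to the full output depth, and get_valid_block_configs is reworked below to account for that.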

10 files changed, +88 -61 lines changed
python/tvm/contrib/ethosu/cascader/device_config.py

Lines changed: 50 additions & 41 deletions
@@ -439,6 +439,23 @@ def is_partkernel(
 
         return part_kernel_first_utilization > depth_first_utilization or ifm_channels <= 8
 
+    def _get_input_banks(self, input_block_shape, input_bytewidth):
+        input_bytes = input_block_shape.area() * self._align(
+            input_block_shape.depth * input_bytewidth, 8
+        )
+        input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
+        input_banks = _round_up(input_banks, self._input_granularity)
+
+        return input_banks
+
+    def _get_accumulator_banks(self, output_block_shape, acc_bytewidth, depth):
+        acc_depth = _round_up(min(output_block_shape.depth, depth), 8)
+        acc_bytes = output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
+        acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
+        acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
+
+        return acc_banks
+
     def get_elementwise_block_config(
         self,
         ifm_propagator: Propagator,
@@ -533,16 +550,9 @@ get_elementwise_block_config
        input2_block.round_up(self._input_micro_block)
 
        # Banks required for input block
-       input_bytes = input_block.area() * self._align(input_block.depth * input_bytewidth, 8)
-       input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
-       input_banks = _round_up(input_banks, self._input_granularity)
-
+       input_banks = self._get_input_banks(input_block, input_bytewidth)
        # Banks required for input2 block
-       input2_bytes = input2_block.area() * self._align(
-           input2_block.depth * input_bytewidth, 8
-       )
-       input2_banks = _round_up_div(input2_bytes, self._bank_size_bytes) * 2
-       input2_banks = _round_up(input2_banks, self._input_granularity)
+       input2_banks = self._get_input_banks(input2_block, input_bytewidth)
 
        # Check whether or not both IFMs fit into SHRAM
        if (input_banks + input2_banks) <= banks_available:
@@ -561,6 +571,29 @@ get_elementwise_block_config
 
         return block_config
 
+    def _get_subkernel_propagator(
+        self, op_attrs, ifm_propagator, input_layout, output_layout, depth
+    ):
+        op_type = op_attrs.get("op")
+        stride_h = int(op_attrs.get("stride_h", 1))
+        stride_w = int(op_attrs.get("stride_w", 1))
+        transform = ifm_propagator.transform
+
+        if input_layout == "NHCWB16":
+            transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
+            transform[3][-1] = min(transform[3][-1], self._subkernel_limits[1] - stride_w)
+        else:
+            transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
+            transform[2][-1] = min(transform[2][-1], self._subkernel_limits[1] - stride_w)
+
+        if op_type in ("ethosu_pooling", "ethosu_depthwise_conv2d"):
+            if output_layout == "NHCWB16" and input_layout == "NHWC":
+                transform[3][-1] = depth
+            elif output_layout == "NHCWB16" and input_layout == "NHCWB16":
+                transform[2][-1] = depth // 16
+
+        return Propagator(transform, ifm_propagator.offset)
+
     def get_valid_block_configs(
         self,
         ifm_propagator: Propagator,
@@ -612,33 +645,13 @@ get_valid_block_configs
         op_type = op_attrs.get("op")
         op_str = op_attrs.get("op_str")
         activation = op_attrs.get("activation", "NONE")
-        stride_h = int(op_attrs.get("stride_h", 1))
-        stride_w = int(op_attrs.get("stride_w", 1))
         upscaling_factor = 1 if op_attrs.get("upscale", "NONE") == "NONE" else 2
 
-        subkernel_transform = ifm_propagator.transform
         if output_layout == "NHCWB16":
             output_shape = _Shape([1, ofm_shape[1], ofm_shape[3], ofm_channels])
         else:
             output_shape = _Shape(ofm_shape)
 
-        if input_layout == "NHCWB16":
-            subkernel_transform[1][-1] = min(
-                subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
-            )
-            subkernel_transform[3][-1] = min(
-                subkernel_transform[3][-1], self._subkernel_limits[1] - stride_w
-            )
-        else:
-            subkernel_transform[1][-1] = min(
-                subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
-            )
-            subkernel_transform[2][-1] = min(
-                subkernel_transform[2][-1], self._subkernel_limits[1] - stride_w
-            )
-
-        subkernel_propagator = Propagator(subkernel_transform, ifm_propagator.offset)
-
         # Define search space
         max_height = min(output_shape.height, self._max_block_shape.height)
         min_height = max(self._micro_block.height, upscaling_factor)
@@ -655,7 +668,7 @@
         if activation == "LUT" and not self._lut_reserved:
             banks_available -= 2
 
-        # Input block depth has additional limitations for Operators that require full input depth
+        # Input block depth has additional limitations for operators that require full input depth
         input_block_depth = 0
         is_partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
         if op_type == "ethosu_conv2d":
@@ -669,6 +682,10 @@
                    # Block depth has to be less than full depth or a multiple of the split depth
                    continue
 
+            subkernel_propagator = self._get_subkernel_propagator(
+                op_attrs, ifm_propagator, input_layout, output_layout, depth
+            )
+
             for width in range(min_width, max_width + min_width, min_width):
                 for height in range(min_height, max_height + min_height, min_height):
                     if output_layout == "NHCWB16":
@@ -709,19 +726,11 @@
                    input_block_shape.depth = input_block_depth
 
                    # Banks required for input block
-                   input_bytes = input_block_shape.area() * self._align(
-                       input_block_shape.depth * input_bytewidth, 8
-                   )
-                   input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
-                   input_banks = _round_up(input_banks, self._input_granularity)
-
+                   input_banks = self._get_input_banks(input_block_shape, input_bytewidth)
                    # Banks required for accumulation
-                   acc_depth = _round_up(min(output_block_shape.depth, ofm_channels), 8)
-                   acc_bytes = (
-                       output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
+                   acc_banks = self._get_accumulator_banks(
+                       output_block_shape, acc_bytewidth, depth
                    )
-                   acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
-                   acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
 
                    if (input_banks + acc_banks) <= banks_available:
                        output_cycles = self._get_output_cycles(
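
For reference, a standalone sketch of the SHRAM bank accounting that the new _get_input_banks and _get_accumulator_banks helpers encapsulate. The rounding helpers, the 1024-byte bank size and the granularity values here are assumptions invented for the example, not values taken from the patch or from any particular Ethos-U variant.

# Standalone sketch of the bank accounting that device_config.py factors out.
def _round_up(a, b):
    # Round a up to the nearest multiple of b.
    return ((a + b - 1) // b) * b

def _round_up_div(a, b):
    # Ceiling division.
    return -(-a // b)

def input_banks(block_area, block_depth, bytewidth,
                bank_size_bytes=1024, input_granularity=4):
    # Bytes for one input block: area x (depth * bytewidth rounded up to 8),
    # then converted to banks and doubled, mirroring the * 2 factor above.
    input_bytes = block_area * _round_up(block_depth * bytewidth, 8)
    banks = _round_up_div(input_bytes, bank_size_bytes) * 2
    return _round_up(banks, input_granularity)

def accumulator_banks(block_area, block_depth, full_depth, acc_bytewidth,
                      bank_size_bytes=1024, acc_granularity=4):
    # Accumulator depth is capped at the full depth and rounded up to 8.
    acc_depth = _round_up(min(block_depth, full_depth), 8)
    acc_bytes = block_area * _round_up(acc_depth, 8) * acc_bytewidth
    banks = _round_up_div(acc_bytes, bank_size_bytes) * 2
    return _round_up(banks, acc_granularity)

# Example: an 8x8x16 int8 input block and an int32 accumulator block.
print(input_banks(block_area=8 * 8, block_depth=16, bytewidth=1))                        # 4
print(accumulator_banks(block_area=8 * 8, block_depth=16, full_depth=32, acc_bytewidth=4))  # 8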

python/tvm/relay/backend/contrib/ethosu/te/convolution.py

Lines changed: 1 addition & 1 deletion
@@ -187,7 +187,7 @@ def conv2d_compute(
         [1, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, ofm_channels],
         [0, 0, 0, 0, 0, 1],
     ]
     ifm_matrix = [

python/tvm/relay/backend/contrib/ethosu/te/depthwise.py

Lines changed: 1 addition & 1 deletion
@@ -181,7 +181,7 @@ def depthwise_conv2d_compute(
         [1, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, channels],
         [0, 0, 0, 0, 0, 1],
     ]
     ifm_matrix = [

python/tvm/relay/backend/contrib/ethosu/te/pooling.py

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@ def pooling_compute(
         [1, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, int(ofm_channels)],
         [0, 0, 0, 0, 0, 1],
     ]
     ifm_matrix = [

src/contrib/ethosu/cascader/block_config.cc

Lines changed: 4 additions & 2 deletions
@@ -37,15 +37,17 @@ void BlockConfigNode::VisitAttrs(AttrVisitor* v) {
   v->Visit("_input_shape", &tmp_arr);
   tmp_arr = make_array(output_shape_);
   v->Visit("_output_shape", &tmp_arr);
+  v->Visit("_compute_cycles", &compute_cycles_);
+  v->Visit("_output_cycles", &output_cycles_);
 }
 
 BlockConfig::BlockConfig(const std::vector<int>& input_shape, const std::vector<int>& output_shape,
                          int compute_cycles, int output_cycles) {
   auto n = make_object<BlockConfigNode>();
   n->input_shape_ = std::move(input_shape);
   n->output_shape_ = std::move(output_shape);
-  n->compute_cycles_ = compute_cycles;
-  n->output_cycles_ = output_cycles;
+  n->compute_cycles_ = std::move(compute_cycles);
+  n->output_cycles_ = std::move(output_cycles);
   data_ = std::move(n);
 }
 

tests/python/contrib/test_ethosu/cascader/infra.py

Lines changed: 10 additions & 2 deletions
@@ -64,7 +64,15 @@ def create_te_graph(func):
     return te_graph, consts
 
 def make_matrices(
-    op_type, kernel, stride, padding, ifm_layout, ofm_layout, dilation=(1, 1), ifm_channels=1
+    op_type,
+    kernel,
+    stride,
+    padding,
+    ifm_layout,
+    ofm_layout,
+    dilation=(1, 1),
+    ifm_channels=1,
+    ofm_channels=1,
 ):
     kernel_h, kernel_w = kernel
     stride_h, stride_w = stride
@@ -83,7 +91,7 @@
         [1, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, ofm_channels],
         [0, 0, 0, 0, 0, 1],
     ]
     if op_type == "ethosu_conv2d":

tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py

Lines changed: 18 additions & 13 deletions
@@ -164,7 +164,7 @@
         ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
         ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
         ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-        ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 16, 1, 4, 4)),
+        ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 10, 1, 6, 4)),
         ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
         # Depthwise Conv2D
         ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
@@ -182,7 +182,7 @@
         ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
         ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
         ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-        ((1, 10, 6, 8), (1, 16, 1, 4, 8)),
+        ((1, 10, 6, 8), (1, 10, 1, 6, 8)),
         ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
         # Depthwise Conv2D
         ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
@@ -252,20 +252,22 @@ def test_best_block_config(
         [0, 0, 0, 0, 16],
         [0, 0, 0, 0, 1],
     ]
-    nhcwb16_to_nhwc = [
-        [1, 0, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0, 0],
-        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
-        [0, 0, 0, 0, 0, 1],
-    ]
-    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
-        op_type, kernel, stride, padding, layouts[0], layouts[1], dilation, in_shape[3]
-    )
 
     ofm_channels = out_shape[3]
     ifm_channels = in_shape[3]
 
+    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
+        op_type,
+        kernel,
+        stride,
+        padding,
+        layouts[0],
+        layouts[1],
+        dilation,
+        ifm_channels,
+        ofm_channels,
+    )
+
     if layouts[0] == "NHCWB16":
         in_shape = [
             int(math.ceil(n)) for n in np.matmul(nhwc_to_nhcwb16, in_shape + (1,)).tolist()[:-1]
@@ -321,9 +323,12 @@ def test_best_block_config(
     # Add tensors
     input_tensor = cs.Tensor(in_shape, "int8")
     part.set_input(0, input_tensor)
-    if op_type in ("ethosu_conv2d", "ethosu_depthwise_conv2d"):
+    if op_type == "ethosu_conv2d":
         weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], ifm_channels], "int8")
         part.set_input(1, weight_tensor)
+    elif op_type == "ethosu_depthwise_conv2d":
+        weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], 1], "int8")
+        part.set_input(1, weight_tensor)
 
     output_tensor = cs.Tensor(out_shape, "int8")
     part.set_output(output_tensor)

tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py

Lines changed: 1 addition & 0 deletions
@@ -82,6 +82,7 @@ def test_ethosu_conv2d_matcher(
         ofm_layout,
         dilation,
         ifm_channels,
+        ofm_channels,
     )
 
     device_config = cs.EthosuDeviceConfig("ethos-u55-256")

tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py

Lines changed: 1 addition & 0 deletions
@@ -83,6 +83,7 @@ def test_ethosu_depthwise2d_matcher(kernel, stride, dilation, padding, ifm_layou
         ifm_layout,
         ofm_layout,
         dilation,
+        ofm_channels=ofm_channels,
     )
 
     device_config = cs.EthosuDeviceConfig("ethos-u55-256")

tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ def test_ethosu_pooling_matcher(pool_shape, stride, padding, ifm_layout, ofm_lay
         padding,
         ifm_layout,
         ofm_layout,
+        ofm_channels=ofm_channels,
     )
 
     device_config = cs.EthosuDeviceConfig("ethos-u55-256")
