Commit 23685d9

[microNPU] Fix layout transform matrix
One of the layout transforms currently causes the cascader to stripe across the B16 axis (which is not allowed), so change that and handle the implications for get_valid_block_configs.

Change-Id: I04199f9f35fcc31618581567483cfb80d3b5aad2
1 parent: 7a6281e · commit: 23685d9
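
The row changed below in convolution.py, depthwise.py, pooling.py and the test infrastructure is the NHCWB16-to-NHWC part of the OFM propagator's layout transform. A minimal sketch of the effect of that change follows: the two matrices are the ones from the diff, while the stripe vector and the ofm_channels value are invented example numbers, and the interpretation (matrices acting on augmented [n, h, c, w, b, 1] shape vectors) is an assumption for illustration only.

# Rough illustration (not part of the patch) of the changed layout transform row.
import numpy as np

ofm_channels = 32  # hypothetical full output depth

old_nhcwb16_to_nhwc = np.array([
    [1, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0],
    [0, 0, 0, 1, 0, 0],
    [0, 0, 16, 0, 1, -16],          # channel extent derived from the C *and* B16 axes
    [0, 0, 0, 0, 0, 1],
])
new_nhcwb16_to_nhwc = np.array([
    [1, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0],
    [0, 0, 0, 1, 0, 0],
    [0, 0, 0, 0, 0, ofm_channels],  # channel extent pinned to the full depth
    [0, 0, 0, 0, 0, 1],
])

stripe_nhcwb16 = np.array([1, 4, 2, 6, 8, 1])  # example [n, h, c, w, b, 1], b < 16

print(old_nhcwb16_to_nhwc @ stripe_nhcwb16)  # -> [1 4 6 24 1]: depends on the B16 value
print(new_nhcwb16_to_nhwc @ stripe_nhcwb16)  # -> [1 4 6 32 1]: always the full depth

With the old row the propagated channel extent depends on the innermost B16 coordinate, which is what let the cascader propose stripes that split the B16 axis; the new row pins the extent to the full output depth, and get_valid_block_configs is reworked below to account for that.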

10 files changed, +88 -61 lines changed
python/tvm/contrib/ethosu/cascader/device_config.py

Lines changed: 50 additions & 41 deletions
@@ -439,6 +439,23 @@ def is_partkernel(
 
         return part_kernel_first_utilization > depth_first_utilization or ifm_channels <= 8
 
+    def _get_input_banks(self, input_block_shape, input_bytewidth):
+        input_bytes = input_block_shape.area() * self._align(
+            input_block_shape.depth * input_bytewidth, 8
+        )
+        input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
+        input_banks = _round_up(input_banks, self._input_granularity)
+
+        return input_banks
+
+    def _get_accumulator_banks(self, output_block_shape, acc_bytewidth, depth):
+        acc_depth = _round_up(min(output_block_shape.depth, depth), 8)
+        acc_bytes = output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
+        acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
+        acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
+
+        return acc_banks
+
     def get_elementwise_block_config(
         self,
         ifm_propagator: Propagator,
@@ -533,16 +550,9 @@ get_elementwise_block_config
        input2_block.round_up(self._input_micro_block)
 
        # Banks required for input block
-       input_bytes = input_block.area() * self._align(input_block.depth * input_bytewidth, 8)
-       input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
-       input_banks = _round_up(input_banks, self._input_granularity)
-
+       input_banks = self._get_input_banks(input_block, input_bytewidth)
        # Banks required for input2 block
-       input2_bytes = input2_block.area() * self._align(
-           input2_block.depth * input_bytewidth, 8
-       )
-       input2_banks = _round_up_div(input2_bytes, self._bank_size_bytes) * 2
-       input2_banks = _round_up(input2_banks, self._input_granularity)
+       input2_banks = self._get_input_banks(input2_block, input_bytewidth)
 
        # Check whether or not both IFMs fit into SHRAM
        if (input_banks + input2_banks) <= banks_available:
@@ -561,6 +571,29 @@ get_elementwise_block_config
 
         return block_config
 
+    def _get_subkernel_propagator(
+        self, op_attrs, ifm_propagator, input_layout, output_layout, depth
+    ):
+        op_type = op_attrs.get("op")
+        stride_h = int(op_attrs.get("stride_h", 1))
+        stride_w = int(op_attrs.get("stride_w", 1))
+        transform = ifm_propagator.transform
+
+        if input_layout == "NHCWB16":
+            transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
+            transform[3][-1] = min(transform[3][-1], self._subkernel_limits[1] - stride_w)
+        else:
+            transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
+            transform[2][-1] = min(transform[2][-1], self._subkernel_limits[1] - stride_w)
+
+        if op_type in ("ethosu_pooling", "ethosu_depthwise_conv2d"):
+            if output_layout == "NHCWB16" and input_layout == "NHWC":
+                transform[3][-1] = depth
+            elif output_layout == "NHCWB16" and input_layout == "NHCWB16":
+                transform[2][-1] = depth // 16
+
+        return Propagator(transform, ifm_propagator.offset)
+
     def get_valid_block_configs(
         self,
         ifm_propagator: Propagator,
@@ -612,33 +645,13 @@ get_valid_block_configs
         op_type = op_attrs.get("op")
         op_str = op_attrs.get("op_str")
         activation = op_attrs.get("activation", "NONE")
-        stride_h = int(op_attrs.get("stride_h", 1))
-        stride_w = int(op_attrs.get("stride_w", 1))
         upscaling_factor = 1 if op_attrs.get("upscale", "NONE") == "NONE" else 2
 
-        subkernel_transform = ifm_propagator.transform
         if output_layout == "NHCWB16":
             output_shape = _Shape([1, ofm_shape[1], ofm_shape[3], ofm_channels])
         else:
             output_shape = _Shape(ofm_shape)
 
-        if input_layout == "NHCWB16":
-            subkernel_transform[1][-1] = min(
-                subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
-            )
-            subkernel_transform[3][-1] = min(
-                subkernel_transform[3][-1], self._subkernel_limits[1] - stride_w
-            )
-        else:
-            subkernel_transform[1][-1] = min(
-                subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
-            )
-            subkernel_transform[2][-1] = min(
-                subkernel_transform[2][-1], self._subkernel_limits[1] - stride_w
-            )
-
-        subkernel_propagator = Propagator(subkernel_transform, ifm_propagator.offset)
-
         # Define search space
         max_height = min(output_shape.height, self._max_block_shape.height)
         min_height = max(self._micro_block.height, upscaling_factor)
@@ -655,7 +668,7 @@
         if activation == "LUT" and not self._lut_reserved:
             banks_available -= 2
 
-        # Input block depth has additional limitations for Operators that require full input depth
+        # Input block depth has additional limitations for operators that require full input depth
         input_block_depth = 0
         is_partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
         if op_type == "ethosu_conv2d":
@@ -669,6 +682,10 @@
                    # Block depth has to be less than full depth or a multiple of the split depth
                    continue
 
+            subkernel_propagator = self._get_subkernel_propagator(
+                op_attrs, ifm_propagator, input_layout, output_layout, depth
+            )
+
             for width in range(min_width, max_width + min_width, min_width):
                 for height in range(min_height, max_height + min_height, min_height):
                     if output_layout == "NHCWB16":
@@ -709,19 +726,11 @@
                    input_block_shape.depth = input_block_depth
 
                    # Banks required for input block
-                   input_bytes = input_block_shape.area() * self._align(
-                       input_block_shape.depth * input_bytewidth, 8
-                   )
-                   input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
-                   input_banks = _round_up(input_banks, self._input_granularity)
-
+                   input_banks = self._get_input_banks(input_block_shape, input_bytewidth)
                    # Banks required for accumulation
-                   acc_depth = _round_up(min(output_block_shape.depth, ofm_channels), 8)
-                   acc_bytes = (
-                       output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
+                   acc_banks = self._get_accumulator_banks(
+                       output_block_shape, acc_bytewidth, depth
                    )
-                   acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
-                   acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
 
                    if (input_banks + acc_banks) <= banks_available:
                        output_cycles = self._get_output_cycles(
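
For reference, a standalone sketch of the SHRAM bank accounting that the new _get_input_banks and _get_accumulator_banks helpers encapsulate. The rounding helpers, the 1024-byte bank size and the granularity values here are assumptions invented for the example, not values taken from the patch or from any particular Ethos-U variant.

# Standalone sketch of the bank accounting that device_config.py factors out.
def _round_up(a, b):
    # Round a up to the nearest multiple of b.
    return ((a + b - 1) // b) * b

def _round_up_div(a, b):
    # Ceiling division.
    return -(-a // b)

def input_banks(block_area, block_depth, bytewidth,
                bank_size_bytes=1024, input_granularity=4):
    # Bytes for one input block: area x (depth * bytewidth rounded up to 8),
    # then converted to banks and doubled, mirroring the * 2 factor above.
    input_bytes = block_area * _round_up(block_depth * bytewidth, 8)
    banks = _round_up_div(input_bytes, bank_size_bytes) * 2
    return _round_up(banks, input_granularity)

def accumulator_banks(block_area, block_depth, full_depth, acc_bytewidth,
                      bank_size_bytes=1024, acc_granularity=4):
    # Accumulator depth is capped at the full depth and rounded up to 8.
    acc_depth = _round_up(min(block_depth, full_depth), 8)
    acc_bytes = block_area * _round_up(acc_depth, 8) * acc_bytewidth
    banks = _round_up_div(acc_bytes, bank_size_bytes) * 2
    return _round_up(banks, acc_granularity)

# Example: an 8x8x16 int8 input block and an int32 accumulator block.
print(input_banks(block_area=8 * 8, block_depth=16, bytewidth=1))                        # 4
print(accumulator_banks(block_area=8 * 8, block_depth=16, full_depth=32, acc_bytewidth=4))  # 8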

python/tvm/relay/backend/contrib/ethosu/te/convolution.py

Lines changed: 1 addition & 1 deletion
@@ -187,7 +187,7 @@ def conv2d_compute(
         [1, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, ofm_channels],
         [0, 0, 0, 0, 0, 1],
     ]
     ifm_matrix = [

python/tvm/relay/backend/contrib/ethosu/te/depthwise.py

Lines changed: 1 addition & 1 deletion
@@ -181,7 +181,7 @@ def depthwise_conv2d_compute(
         [1, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, channels],
         [0, 0, 0, 0, 0, 1],
     ]
     ifm_matrix = [

python/tvm/relay/backend/contrib/ethosu/te/pooling.py

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@ def pooling_compute(
         [1, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, int(ofm_channels)],
         [0, 0, 0, 0, 0, 1],
     ]
     ifm_matrix = [

src/contrib/ethosu/cascader/block_config.cc

Lines changed: 4 additions & 2 deletions
@@ -37,15 +37,17 @@ void BlockConfigNode::VisitAttrs(AttrVisitor* v) {
   v->Visit("_input_shape", &tmp_arr);
   tmp_arr = make_array(output_shape_);
   v->Visit("_output_shape", &tmp_arr);
+  v->Visit("_compute_cycles", &compute_cycles_);
+  v->Visit("_output_cycles", &output_cycles_);
 }
 
 BlockConfig::BlockConfig(const std::vector<int>& input_shape, const std::vector<int>& output_shape,
                          int compute_cycles, int output_cycles) {
   auto n = make_object<BlockConfigNode>();
   n->input_shape_ = std::move(input_shape);
   n->output_shape_ = std::move(output_shape);
-  n->compute_cycles_ = compute_cycles;
-  n->output_cycles_ = output_cycles;
+  n->compute_cycles_ = std::move(compute_cycles);
+  n->output_cycles_ = std::move(output_cycles);
   data_ = std::move(n);
 }
 

tests/python/contrib/test_ethosu/cascader/infra.py

Lines changed: 10 additions & 2 deletions
@@ -64,7 +64,15 @@ def create_te_graph(func):
     return te_graph, consts
 
 def make_matrices(
-    op_type, kernel, stride, padding, ifm_layout, ofm_layout, dilation=(1, 1), ifm_channels=1
+    op_type,
+    kernel,
+    stride,
+    padding,
+    ifm_layout,
+    ofm_layout,
+    dilation=(1, 1),
+    ifm_channels=1,
+    ofm_channels=1,
 ):
     kernel_h, kernel_w = kernel
     stride_h, stride_w = stride
@@ -83,7 +91,7 @@
         [1, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, ofm_channels],
         [0, 0, 0, 0, 0, 1],
     ]
     if op_type == "ethosu_conv2d":

tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py

Lines changed: 18 additions & 13 deletions
@@ -164,7 +164,7 @@
         ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
         ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
         ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-        ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 16, 1, 4, 4)),
+        ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 10, 1, 6, 4)),
         ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
         # Depthwise Conv2D
         ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
@@ -182,7 +182,7 @@
         ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
         ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
         ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-        ((1, 10, 6, 8), (1, 16, 1, 4, 8)),
+        ((1, 10, 6, 8), (1, 10, 1, 6, 8)),
         ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
         # Depthwise Conv2D
         ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
@@ -252,20 +252,22 @@ def test_best_block_config(
         [0, 0, 0, 0, 16],
         [0, 0, 0, 0, 1],
     ]
-    nhcwb16_to_nhwc = [
-        [1, 0, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0, 0],
-        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
-        [0, 0, 0, 0, 0, 1],
-    ]
-    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
-        op_type, kernel, stride, padding, layouts[0], layouts[1], dilation, in_shape[3]
-    )
 
     ofm_channels = out_shape[3]
     ifm_channels = in_shape[3]
 
+    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
+        op_type,
+        kernel,
+        stride,
+        padding,
+        layouts[0],
+        layouts[1],
+        dilation,
+        ifm_channels,
+        ofm_channels,
+    )
+
     if layouts[0] == "NHCWB16":
         in_shape = [
             int(math.ceil(n)) for n in np.matmul(nhwc_to_nhcwb16, in_shape + (1,)).tolist()[:-1]
@@ -321,9 +323,12 @@ def test_best_block_config(
     # Add tensors
     input_tensor = cs.Tensor(in_shape, "int8")
     part.set_input(0, input_tensor)
-    if op_type in ("ethosu_conv2d", "ethosu_depthwise_conv2d"):
+    if op_type == "ethosu_conv2d":
         weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], ifm_channels], "int8")
         part.set_input(1, weight_tensor)
+    elif op_type == "ethosu_depthwise_conv2d":
+        weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], 1], "int8")
+        part.set_input(1, weight_tensor)
 
     output_tensor = cs.Tensor(out_shape, "int8")
     part.set_output(output_tensor)

tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py

Lines changed: 1 addition & 0 deletions
@@ -82,6 +82,7 @@ def test_ethosu_conv2d_matcher(
         ofm_layout,
         dilation,
         ifm_channels,
+        ofm_channels,
     )
 
     device_config = cs.EthosuDeviceConfig("ethos-u55-256")

tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py

Lines changed: 1 addition & 0 deletions
@@ -83,6 +83,7 @@ def test_ethosu_depthwise2d_matcher(kernel, stride, dilation, padding, ifm_layou
         ifm_layout,
         ofm_layout,
         dilation,
+        ofm_channels=ofm_channels,
     )
 
     device_config = cs.EthosuDeviceConfig("ethos-u55-256")

tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ def test_ethosu_pooling_matcher(pool_shape, stride, padding, ifm_layout, ofm_lay
         padding,
         ifm_layout,
         ofm_layout,
+        ofm_channels=ofm_channels,
     )
 
     device_config = cs.EthosuDeviceConfig("ethos-u55-256")
