Skip to content

Commit d0c7c78

Browse files
authored
[microNPU] Tweak a layout transform matrix (#10763)
* [microNPU] Fix layout transform matrix. One of the layout transforms currently causes the cascader to stripe across the B16 axis (which is not allowed), so change that and deal with the implications for get_valid_block_configs. Change-Id: I04199f9f35fcc31618581567483cfb80d3b5aad2
* Reduce the duplication of layout transform matrices
* Change the nhcwb16_to_nhwc matrix for binary and unary elementwise such that it matches the other NPU ops
* Reduce the number of places where the same layout transform matrices are defined
* Add documentation to the layout transform matrices
1 parent 62e0470 commit d0c7c78

File tree

15 files changed

+182
-227
lines changed

15 files changed

+182
-227
lines changed

python/tvm/contrib/ethosu/cascader/device_config.py

Lines changed: 50 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,23 @@ def is_partkernel(
439439

440440
return part_kernel_first_utilization > depth_first_utilization or ifm_channels <= 8
441441

442+
def _get_input_banks(self, input_block_shape, input_bytewidth):
443+
input_bytes = input_block_shape.area() * self._align(
444+
input_block_shape.depth * input_bytewidth, 8
445+
)
446+
input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
447+
input_banks = _round_up(input_banks, self._input_granularity)
448+
449+
return input_banks
450+
451+
def _get_accumulator_banks(self, output_block_shape, acc_bytewidth, depth):
452+
acc_depth = _round_up(min(output_block_shape.depth, depth), 8)
453+
acc_bytes = output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
454+
acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
455+
acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
456+
457+
return acc_banks
458+
442459
def get_elementwise_block_config(
443460
self,
444461
ifm_propagator: Propagator,
@@ -533,16 +550,9 @@ def get_elementwise_block_config(
533550
input2_block.round_up(self._input_micro_block)
534551

535552
# Banks required for input block
536-
input_bytes = input_block.area() * self._align(input_block.depth * input_bytewidth, 8)
537-
input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
538-
input_banks = _round_up(input_banks, self._input_granularity)
539-
553+
input_banks = self._get_input_banks(input_block, input_bytewidth)
540554
# Banks required for input2 block
541-
input2_bytes = input2_block.area() * self._align(
542-
input2_block.depth * input_bytewidth, 8
543-
)
544-
input2_banks = _round_up_div(input2_bytes, self._bank_size_bytes) * 2
545-
input2_banks = _round_up(input2_banks, self._input_granularity)
555+
input2_banks = self._get_input_banks(input2_block, input_bytewidth)
546556

547557
# Check whether or not both IFMs fit into SHRAM
548558
if (input_banks + input2_banks) <= banks_available:
@@ -561,6 +571,29 @@ def get_elementwise_block_config(
561571

562572
return block_config
563573

574+
def _get_subkernel_propagator(
575+
self, op_attrs, ifm_propagator, input_layout, output_layout, depth
576+
):
577+
op_type = op_attrs.get("op")
578+
stride_h = int(op_attrs.get("stride_h", 1))
579+
stride_w = int(op_attrs.get("stride_w", 1))
580+
transform = ifm_propagator.transform
581+
582+
if input_layout == "NHCWB16":
583+
transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
584+
transform[3][-1] = min(transform[3][-1], self._subkernel_limits[1] - stride_w)
585+
else:
586+
transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
587+
transform[2][-1] = min(transform[2][-1], self._subkernel_limits[1] - stride_w)
588+
589+
if op_type in ("ethosu_pooling", "ethosu_depthwise_conv2d"):
590+
if output_layout == "NHCWB16" and input_layout == "NHWC":
591+
transform[3][-1] = depth
592+
elif output_layout == "NHCWB16" and input_layout == "NHCWB16":
593+
transform[2][-1] = depth // 16
594+
595+
return Propagator(transform, ifm_propagator.offset)
596+
564597
def get_valid_block_configs(
565598
self,
566599
ifm_propagator: Propagator,
@@ -612,33 +645,13 @@ def get_valid_block_configs(
612645
op_type = op_attrs.get("op")
613646
op_str = op_attrs.get("op_str")
614647
activation = op_attrs.get("activation", "NONE")
615-
stride_h = int(op_attrs.get("stride_h", 1))
616-
stride_w = int(op_attrs.get("stride_w", 1))
617648
upscaling_factor = 1 if op_attrs.get("upscale", "NONE") == "NONE" else 2
618649

619-
subkernel_transform = ifm_propagator.transform
620650
if output_layout == "NHCWB16":
621651
output_shape = _Shape([1, ofm_shape[1], ofm_shape[3], ofm_channels])
622652
else:
623653
output_shape = _Shape(ofm_shape)
624654

625-
if input_layout == "NHCWB16":
626-
subkernel_transform[1][-1] = min(
627-
subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
628-
)
629-
subkernel_transform[3][-1] = min(
630-
subkernel_transform[3][-1], self._subkernel_limits[1] - stride_w
631-
)
632-
else:
633-
subkernel_transform[1][-1] = min(
634-
subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
635-
)
636-
subkernel_transform[2][-1] = min(
637-
subkernel_transform[2][-1], self._subkernel_limits[1] - stride_w
638-
)
639-
640-
subkernel_propagator = Propagator(subkernel_transform, ifm_propagator.offset)
641-
642655
# Define search space
643656
max_height = min(output_shape.height, self._max_block_shape.height)
644657
min_height = max(self._micro_block.height, upscaling_factor)
@@ -655,7 +668,7 @@ def get_valid_block_configs(
655668
if activation == "LUT" and not self._lut_reserved:
656669
banks_available -= 2
657670

658-
# Input block depth has additional limitations for Operators that require full input depth
671+
# Input block depth has additional limitations for operators that require full input depth
659672
input_block_depth = 0
660673
is_partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
661674
if op_type == "ethosu_conv2d":
@@ -669,6 +682,10 @@ def get_valid_block_configs(
669682
# Block depth has to be less than full depth or a multiple of the split depth
670683
continue
671684

685+
subkernel_propagator = self._get_subkernel_propagator(
686+
op_attrs, ifm_propagator, input_layout, output_layout, depth
687+
)
688+
672689
for width in range(min_width, max_width + min_width, min_width):
673690
for height in range(min_height, max_height + min_height, min_height):
674691
if output_layout == "NHCWB16":
@@ -709,19 +726,11 @@ def get_valid_block_configs(
709726
input_block_shape.depth = input_block_depth
710727

711728
# Banks required for input block
712-
input_bytes = input_block_shape.area() * self._align(
713-
input_block_shape.depth * input_bytewidth, 8
714-
)
715-
input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
716-
input_banks = _round_up(input_banks, self._input_granularity)
717-
729+
input_banks = self._get_input_banks(input_block_shape, input_bytewidth)
718730
# Banks required for accumulation
719-
acc_depth = _round_up(min(output_block_shape.depth, ofm_channels), 8)
720-
acc_bytes = (
721-
output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
731+
acc_banks = self._get_accumulator_banks(
732+
output_block_shape, acc_bytewidth, depth
722733
)
723-
acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
724-
acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
725734

726735
if (input_banks + acc_banks) <= banks_available:
727736
output_cycles = self._get_output_cycles(

python/tvm/relay/backend/contrib/ethosu/te/binary_elementwise.py

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher
2323

2424
from .dma import dma_ofm_compute, dma_ifm_compute
25+
from .common import get_layout_transform_matrices
2526

2627

2728
def binary_elementwise_compute(
@@ -196,21 +197,8 @@ def binary_elementwise_compute(
196197
attrs=binary_elementwise_attrs,
197198
)
198199

199-
nhwc_to_nhcwb16 = [
200-
[1, 0, 0, 0, 0],
201-
[0, 1, 0, 0, 0],
202-
[0, 0, 0, 1 / 16, 0],
203-
[0, 0, 1, 0, 0],
204-
[0, 0, 0, 0, 16],
205-
[0, 0, 0, 0, 1],
206-
]
207-
nhcwb16_to_nhwc = [
208-
[1, 0, 0, 0, 0, 0],
209-
[0, 1, 0, 0, 0, 0],
210-
[0, 0, 0, 1, 0, 0],
211-
[0, 0, 16, 0, 1, -16],
212-
[0, 0, 0, 0, 0, 1],
213-
]
200+
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ifm_channels))
201+
214202
ifm_matrix = [
215203
[1, 0, 0, 0, 0],
216204
[0, 1, 0, 0, 0],
python/tvm/relay/backend/contrib/ethosu/te/common.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
"""Common methods for the NPU tensor expressions"""
18+
19+
from typing import Tuple, List
20+
21+
22+
def get_layout_transform_matrices(ofm_channels: int) -> Tuple[List[List[float]], List[List[float]]]:
23+
"""Get the NHWC->NHCWB16 and NHCWB16->NHWC layout transform matrices.
24+
For information about the supported layouts see https://developer.arm.com/documentation/102420/
25+
0200/Functional-description/Control-and-data-flow/Supported-memory-formats-for-feature-maps
26+
27+
Parameters
28+
----------
29+
ofm_channels : int
30+
The number of output channels in a NHWC layout
31+
32+
Returns
33+
-------
34+
nhwc_to_nhcwb16, nhcwb16_to_nhwc : Tuple[List[List[float]], List[List[float]]]
35+
The layout transformation matrices
36+
"""
37+
38+
# The value of the last dimension (B16) is always 16.
39+
nhwc_to_nhcwb16 = [
40+
[1, 0, 0, 0, 0],
41+
[0, 1, 0, 0, 0],
42+
[0, 0, 0, 1 / 16, 0],
43+
[0, 0, 1, 0, 0],
44+
[0, 0, 0, 0, 16],
45+
[0, 0, 0, 0, 1],
46+
]
47+
48+
# When we convert from NHWC to NHCWB16, the new C value is given by
49+
# (ofm_channels - 1) // 16 + 1, which is a lossy operation, so we need to use
50+
# the actual value of channels in the transform matrix to accurately recover
51+
# the C in NHWC when we convert from NHCWB16 to NHWC.
52+
nhcwb16_to_nhwc = [
53+
[1, 0, 0, 0, 0, 0],
54+
[0, 1, 0, 0, 0, 0],
55+
[0, 0, 0, 1, 0, 0],
56+
[0, 0, 0, 0, 0, ofm_channels],
57+
[0, 0, 0, 0, 0, 1],
58+
]
59+
60+
return nhwc_to_nhcwb16, nhcwb16_to_nhwc

python/tvm/relay/backend/contrib/ethosu/te/convolution.py

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher
2424

2525
from .dma import dma_ofm_compute, dma_ifm_compute
26+
from .common import get_layout_transform_matrices
2627

2728

2829
def conv2d_compute(
@@ -175,21 +176,8 @@ def conv2d_compute(
175176
attrs=conv2d_attrs,
176177
)
177178

178-
nhwc_to_nhcwb16 = [
179-
[1, 0, 0, 0, 0],
180-
[0, 1, 0, 0, 0],
181-
[0, 0, 0, 1 / 16, 0],
182-
[0, 0, 1, 0, 0],
183-
[0, 0, 0, 0, 16],
184-
[0, 0, 0, 0, 1],
185-
]
186-
nhcwb16_to_nhwc = [
187-
[1, 0, 0, 0, 0, 0],
188-
[0, 1, 0, 0, 0, 0],
189-
[0, 0, 0, 1, 0, 0],
190-
[0, 0, 16, 0, 1, -16],
191-
[0, 0, 0, 0, 0, 1],
192-
]
179+
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(ofm_channels)
180+
193181
ifm_matrix = [
194182
[1, 0, 0, 0, 0],
195183
[0, stride_h, 0, 0, (dilated_kernel_h - stride_h)],

python/tvm/relay/backend/contrib/ethosu/te/depthwise.py

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher
2424

2525
from .dma import dma_ofm_compute, dma_ifm_compute
26+
from .common import get_layout_transform_matrices
2627

2728

2829
def depthwise_conv2d_compute(
@@ -169,21 +170,8 @@ def depthwise_conv2d_compute(
169170
attrs=depthwise_conv2d_attrs,
170171
)
171172

172-
nhwc_to_nhcwb16 = [
173-
[1, 0, 0, 0, 0],
174-
[0, 1, 0, 0, 0],
175-
[0, 0, 0, 1 / 16, 0],
176-
[0, 0, 1, 0, 0],
177-
[0, 0, 0, 0, 16],
178-
[0, 0, 0, 0, 1],
179-
]
180-
nhcwb16_to_nhwc = [
181-
[1, 0, 0, 0, 0, 0],
182-
[0, 1, 0, 0, 0, 0],
183-
[0, 0, 0, 1, 0, 0],
184-
[0, 0, 16, 0, 1, -16],
185-
[0, 0, 0, 0, 0, 1],
186-
]
173+
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(channels)
174+
187175
ifm_matrix = [
188176
[1, 0, 0, 0, 0],
189177
[0, stride_h, 0, 0, (dilated_kernel_h - stride_h)],

python/tvm/relay/backend/contrib/ethosu/te/pooling.py

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher
2424

2525
from .dma import dma_ofm_compute, dma_ifm_compute
26+
from .common import get_layout_transform_matrices
2627

2728

2829
def pooling_compute(
@@ -157,21 +158,8 @@ def pooling_compute(
157158
attrs=pooling_attrs,
158159
)
159160

160-
nhwc_to_nhcwb16 = [
161-
[1, 0, 0, 0, 0],
162-
[0, 1, 0, 0, 0],
163-
[0, 0, 0, 1 / 16, 0],
164-
[0, 0, 1, 0, 0],
165-
[0, 0, 0, 0, 16],
166-
[0, 0, 0, 0, 1],
167-
]
168-
nhcwb16_to_nhwc = [
169-
[1, 0, 0, 0, 0, 0],
170-
[0, 1, 0, 0, 0, 0],
171-
[0, 0, 0, 1, 0, 0],
172-
[0, 0, 16, 0, 1, -16],
173-
[0, 0, 0, 0, 0, 1],
174-
]
161+
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ofm_channels))
162+
175163
ifm_matrix = [
176164
[1, 0, 0, 0, 0],
177165
[0, stride_h, 0, 0, (pool_shape_h - stride_h)],

python/tvm/relay/backend/contrib/ethosu/te/unary_elementwise.py

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from tvm import te
2222
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher
2323
from .dma import dma_ofm_compute, dma_ifm_compute
24+
from .common import get_layout_transform_matrices
2425

2526

2627
def unary_elementwise_compute(
@@ -129,21 +130,8 @@ def clz_imp(inp):
129130
attrs=unary_elementwise_attrs,
130131
)
131132

132-
nhwc_to_nhcwb16 = [
133-
[1, 0, 0, 0, 0],
134-
[0, 1, 0, 0, 0],
135-
[0, 0, 0, 1 / 16, 0],
136-
[0, 0, 1, 0, 0],
137-
[0, 0, 0, 0, 16],
138-
[0, 0, 0, 0, 1],
139-
]
140-
nhcwb16_to_nhwc = [
141-
[1, 0, 0, 0, 0, 0],
142-
[0, 1, 0, 0, 0, 0],
143-
[0, 0, 0, 1, 0, 0],
144-
[0, 0, 16, 0, 1, -16],
145-
[0, 0, 0, 0, 0, 1],
146-
]
133+
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ofm_channels))
134+
147135
ifm_matrix = [
148136
[1, 0, 0, 0, 0],
149137
[0, 1, 0, 0, 0],

src/contrib/ethosu/cascader/block_config.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ void BlockConfigNode::VisitAttrs(AttrVisitor* v) {
3737
v->Visit("_input_shape", &tmp_arr);
3838
tmp_arr = make_array(output_shape_);
3939
v->Visit("_output_shape", &tmp_arr);
40+
v->Visit("_compute_cycles", &compute_cycles_);
41+
v->Visit("_output_cycles", &output_cycles_);
4042
}
4143

4244
BlockConfig::BlockConfig(const std::vector<int>& input_shape, const std::vector<int>& output_shape,

0 commit comments

Comments
 (0)