Skip to content

Commit 0b95780

Browse files
authored
[microNPU] Cascader performance model bugfixes (#10510)

* [microNPU] Performance model bugfixes
  * Fixed incorrect num_blocks calculations for both BufferModes.
  * Fixed similar issues with Read/Write byte calculations.
  * Fixed an issue where the 'partkernel' flag was not propagated to the performance estimation code.
  * Fixed the single-buffering check, which incorrectly used the output shape and block rather than the input shape and block.
  * Fixed block config not being aligned to the micro block for Elementwise.

  Change-Id: Ide6b231bc1a17c65bed20129d2179a215ada14b2

* Address review comment
  * Changed incorrect usage of 'max_width' to 'max_depth'.
1 parent 8cf0c3e commit 0b95780

File tree

5 files changed

+45
-35
lines changed

5 files changed

+45
-35
lines changed

python/tvm/contrib/ethosu/cascader/device_config.py

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ def _get_input_block(
288288
input_shape: _Shape,
289289
dtype: str,
290290
op_type: str,
291-
is_partkernel: bool,
291+
partkernel: bool,
292292
stride_h: int,
293293
stride_w: int,
294294
dilated_kernel_h: int,
@@ -310,7 +310,7 @@ def _get_input_block(
310310

311311
if op_type == "ethosu_conv2d":
312312
if dtype == "int8":
313-
if is_partkernel:
313+
if partkernel:
314314
depth = self._align(min(32, input_shape.depth), 8)
315315
else:
316316
depth = self._align(min(16, input_shape.depth), 8)
@@ -336,7 +336,7 @@ def get_kernel_steps(
336336
dilated_kernel_h: int,
337337
dilated_kernel_w: int,
338338
ifm_dtype: str,
339-
is_partkernel: bool = False,
339+
partkernel: bool = False,
340340
) -> List[int]:
341341
"""Calculate the total number of subkernels and their sizes
342342
@@ -351,7 +351,7 @@ def get_kernel_steps(
351351
Width of dilated kernel
352352
ifm_dtype: str
353353
Datatype of the Input Feature Map tensor (IFM)
354-
is_partkernel: bool
354+
partkernel: bool
355355
Flag showing whether part-kernel first traversal is used
356356
357357
Returns
@@ -368,7 +368,7 @@ def get_kernel_steps(
368368
kernel_steps = []
369369
for y, x in subkernels:
370370
subkernel_elements = x * y
371-
if op_type == "ethosu_conv2d" and is_partkernel:
371+
if op_type == "ethosu_conv2d" and partkernel:
372372
# Part-kernel-first traversal conv2d
373373
divisor = 4 if ifm_dtype == "int8" else 2
374374
kernel_steps.append(int(_round_up_div(subkernel_elements, divisor)))
@@ -509,29 +509,31 @@ def get_elementwise_block_config(
509509
banks_available -= 2
510510

511511
# Split the block in half until it fits into SHRAM
512+
max_height, max_width, max_depth = self._max_block_shape.as_list()[1:]
512513
if output_layout == "NHCWB16":
513514
split_order = (a for a in [1, 3, 2])
514515
output_block = [
515516
output_shape[0],
516-
min(output_shape[1], self._max_block_shape.height),
517-
min(output_shape[2] * output_shape[4], self._max_block_shape.depth),
518-
min(output_shape[3], self._max_block_shape.width),
517+
_round_up(min(output_shape[1], max_height), self._micro_block.height),
518+
min(output_shape[2] * output_shape[4], max_depth),
519+
_round_up(min(output_shape[3], max_width), self._micro_block.width),
519520
16,
520521
]
521522
else:
522523
split_order = (a for a in [1, 2, 3])
523524
output_block = [
524525
output_shape[0],
525-
min(output_shape[1], self._max_block_shape.height),
526-
min(output_shape[2], self._max_block_shape.width),
527-
min(output_shape[3], self._max_block_shape.depth),
526+
_round_up(min(output_shape[1], max_height), self._micro_block.height),
527+
_round_up(min(output_shape[2], max_width), self._micro_block.width),
528+
_round_up(min(output_shape[3], max_depth), self._micro_block.depth),
528529
]
529530
split_axis = next(split_order)
531+
532+
offset = [0] * len(output_block)
533+
stripes = [1] * len(output_block)
534+
order = [1, 2, 4, 3, 0] if output_layout == "NHCWB16" else [1, 2, 3, 4]
530535
while True:
531536
# Create stripe config for output block
532-
offset = [0] * len(output_block)
533-
stripes = [1] * len(output_block)
534-
order = [1, 2, 4, 3, 0] if output_layout == "NHCWB16" else [1, 2, 3, 4]
535537
output_stripe_config = StripeConfig(
536538
output_block, output_block, output_block, order, stripes, offset
537539
)
@@ -564,10 +566,12 @@ def get_elementwise_block_config(
564566
block_config.append(BlockConfig(output_block, output_block, 0, output_cycles))
565567
break
566568

567-
if output_block[split_axis] == 1:
569+
if output_block[split_axis] == self._micro_block.as_list()[split_axis]:
568570
split_axis = next(split_order)
569571

570-
output_block[split_axis] = _round_up_div(output_block[split_axis], 2)
572+
output_block[split_axis] = _round_up(
573+
_round_up_div(output_block[split_axis], 2), self._micro_block.as_list()[split_axis]
574+
)
571575

572576
return block_config
573577

@@ -670,9 +674,9 @@ def get_valid_block_configs(
670674

671675
# Input block depth has additional limitations for operators that require full input depth
672676
input_block_depth = 0
673-
is_partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
677+
partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
674678
if op_type == "ethosu_conv2d":
675-
if is_partkernel:
679+
if partkernel:
676680
input_block_depth = min(ifm_channels, 16)
677681
else:
678682
input_block_depth = min(ifm_channels, 32)
@@ -745,7 +749,8 @@ def get_valid_block_configs(
745749
kernel_h,
746750
kernel_w,
747751
ifm_channels,
748-
is_partkernel,
752+
"int8",
753+
partkernel,
749754
)
750755
block_config = BlockConfig(
751756
input_block_shape.as_list(), output_block, compute_cycles, output_cycles
@@ -767,15 +772,15 @@ def _estimate_compute_cycles_per_block(
767772
kernel_w: int,
768773
input_channels: int,
769774
ifm_dtype: str,
770-
is_partkernel: bool = False,
775+
partkernel: bool = False,
771776
) -> Tuple[int, int]:
772777
# Calculate the amount of micro blocks per block, per axis
773778
num_quantum_x = _round_up_div(block_shape.width, self._micro_block.width)
774779
num_quantum_y = _round_up_div(block_shape.height, self._micro_block.height)
775780
num_quantum_z = _round_up_div(block_shape.depth, self._micro_block.depth)
776781
num_quantum_xy = num_quantum_x * num_quantum_y
777782

778-
kernel_steps = self.get_kernel_steps(op_type, kernel_h, kernel_w, ifm_dtype, is_partkernel)
783+
kernel_steps = self.get_kernel_steps(op_type, kernel_h, kernel_w, ifm_dtype, partkernel)
779784

780785
wd_cycles = self._get_weight_decoder_cycles(op_type)
781786
delay_cycles = self._get_delay_cycles(op_type, ifm_dtype)
@@ -794,7 +799,7 @@ def _estimate_compute_cycles_per_block(
794799
elif subkernel_steps > 1:
795800
compute_cycles += delay_cycles * (subkernel_steps - 1) * num_quantum_z
796801

797-
if is_partkernel:
802+
if partkernel:
798803
compute_cycles *= _round_up_div(input_block_shape.depth, 8)
799804

800805
if op_type == "ethosu_conv2d":

src/contrib/ethosu/cascader/parts/ethosu.cc

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ const BlockConfig EthosuPartNode::GetBlockConfig(const StripeConfig& output_stri
7474
BlockConfig best_block_config;
7575
float best_cost = std::numeric_limits<float>::infinity();
7676
std::vector<int> output_stripe_shape = output_stripe_config->GetShape();
77+
auto input_stripe_configs = CalculateInputStripeConfigs(output_stripe_config);
78+
std::vector<int> input_stripe_shape = input_stripe_configs[0]->GetShape();
7779

7880
for (const auto& block_config : valid_block_configs_) {
7981
std::vector<int> output_block = block_config->GetOutputBlockShape();
@@ -86,7 +88,7 @@ const BlockConfig EthosuPartNode::GetBlockConfig(const StripeConfig& output_stri
8688
mul_reduce(output_stripe_shape);
8789

8890
// Single buffering hardware optimization
89-
if (mul_reduce(output_stripe_shape) <= 2 * mul_reduce(output_block)) {
91+
if (mul_reduce(input_stripe_shape) <= 2 * mul_reduce(block_config->GetInputBlockShape())) {
9092
relative_cost /= 2;
9193
}
9294

@@ -107,25 +109,25 @@ const PerformanceInfo EthosuPartNode::GetPerformanceInfo(const StripeConfig& out
107109
std::vector<int64_t> bytes_per_input =
108110
GetBytesRead(block_shape, output_stripe_config->GetShape());
109111

110-
int elements_per_block = mul_reduce(block_shape);
111-
int bytes_per_output = elements_per_block;
112112
float num_blocks = 1.0f;
113113
for (size_t i = 0; i < block_shape.size(); i++) {
114114
if (buffer_mode == BufferMode::RECOMPUTE) {
115-
num_blocks *= static_cast<float>(output_stripe_config->GetShape()[i] *
116-
output_stripe_config->GetStripes()[i]) /
117-
block_shape[i];
115+
num_blocks *= std::max(static_cast<float>(output_stripe_config->GetShape()[i]) /
116+
block_shape[i] * output_stripe_config->GetStripes()[i],
117+
1.0f);
118118
} else {
119119
num_blocks *=
120-
std::max(static_cast<float>(output_stripe_config->GetExtent()[i]) / block_shape[i], 1.0f);
120+
std::max(static_cast<float>(output_tensor_->GetShape()[i]) / block_shape[i], 1.0f);
121121
}
122122
}
123-
float num_stripes = mul_reduce(output_stripe_config->GetStripes()) - 1.0f;
123+
124+
float num_stripes = mul_reduce(output_stripe_config->GetStripes());
124125
std::vector<int64_t> read_bytes;
125-
for (int block_bytes : bytes_per_input) {
126-
read_bytes.push_back((num_blocks + num_stripes) * block_bytes);
126+
for (int64_t stripe_bytes : bytes_per_input) {
127+
read_bytes.push_back(num_stripes * stripe_bytes);
127128
}
128-
int64_t write_bytes = (num_blocks + num_stripes) * bytes_per_output;
129+
int64_t write_bytes =
130+
num_blocks * mul_reduce(block_shape) * output_tensor_->GetDataType().bytes();
129131

130132
int block_output_cycles = block_config->GetOutputCycles();
131133
int block_compute_cycles = block_config->GetComputeCycles();

tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@
207207
((1, 7, 10, 16), (1, 7, 1, 10, 16)),
208208
((1, 7, 6, 16), (1, 7, 1, 6, 16)),
209209
# Pooling
210-
((1, 1, 2, 80), (1, 1, 5, 2, 16)),
210+
((1, 1, 2, 16), (1, 1, 1, 2, 16)),
211211
((1, 10, 6, 16), (1, 10, 1, 6, 16)),
212212
],
213213
),
@@ -225,7 +225,7 @@
225225
((1, 8, 20, 16), (1, 8, 1, 20, 16)),
226226
((1, 14, 6, 16), (1, 14, 1, 6, 16)),
227227
# Pooling
228-
((1, 2, 2, 48), (1, 2, 3, 2, 16)),
228+
((1, 2, 2, 16), (1, 2, 1, 2, 16)),
229229
((1, 10, 12, 16), (1, 10, 1, 12, 16)),
230230
],
231231
),

tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ def test_ethosu_part():
4747
)
4848
input_tensor = cs.Tensor(shape=[1, 66, 74, 16], dtype="int8")
4949
part.set_input(0, input_tensor)
50+
output_tensor = cs.Tensor(shape=[1, 66, 74, 16], dtype="int8")
51+
part.set_output(output_tensor)
5052

5153
assert part.get_stripe_align_hint() == output_quantum
5254
# Check that the performance model runs, don't verify output

tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ def test_conv_performance(
216216
)
217217
part.set_input(0, cs.Tensor(in_shape, "int8"))
218218
part.set_input(1, cs.Tensor([ifm_channels, kernel[0], kernel[1], out_shape[-1]], "int8"))
219+
part.set_output(cs.Tensor(out_shape, "int8"))
219220

220221
stripes = [1] * len(output_quantum)
221222
offset = [0] * len(output_quantum)

0 commit comments

Comments
 (0)