diff --git a/python/tvm/contrib/ethosu/cascader/block_config.py b/python/tvm/contrib/ethosu/cascader/block_config.py index 3281b8a3606f..f246918cf490 100644 --- a/python/tvm/contrib/ethosu/cascader/block_config.py +++ b/python/tvm/contrib/ethosu/cascader/block_config.py @@ -28,11 +28,21 @@ class BlockConfig(Object): """BlockConfig class""" - def __init__(self, output_shape: List[int], compute_cycles: int, output_cycles: int): + def __init__( + self, + input_shape: List[int], + output_shape: List[int], + compute_cycles: int, + output_cycles: int, + ): self.__init_handle_by_constructor__( - _ffi_api.BlockConfig, output_shape, compute_cycles, output_cycles + _ffi_api.BlockConfig, input_shape, output_shape, compute_cycles, output_cycles ) + @property + def input_shape(self) -> List[int]: + return list(self._input_shape) + @property def output_shape(self) -> List[int]: return list(self._output_shape) diff --git a/python/tvm/contrib/ethosu/cascader/device_config.py b/python/tvm/contrib/ethosu/cascader/device_config.py index 68a218da2616..4670a238cf96 100644 --- a/python/tvm/contrib/ethosu/cascader/device_config.py +++ b/python/tvm/contrib/ethosu/cascader/device_config.py @@ -551,7 +551,7 @@ def get_elementwise_block_config( ) output_cycles *= reduce(lambda a, b: a * b, output_block, 1) output_cycles = int(math.ceil(output_cycles)) - block_config.append(BlockConfig(output_block, 0, output_cycles)) + block_config.append(BlockConfig(output_block, output_block, 0, output_cycles)) break if output_block[split_axis] == 1: @@ -738,9 +738,10 @@ def get_valid_block_configs( ifm_channels, is_partkernel, ) - valid_block_configs.append( - BlockConfig(output_block, compute_cycles, output_cycles) + block_config = BlockConfig( + input_block_shape.as_list(), output_block, compute_cycles, output_cycles ) + valid_block_configs.append(block_config) else: # Block config does not fit into SHRAM # Any Block config that is strictly larger than this one will also fail diff --git 
a/python/tvm/contrib/ethosu/cascader/graph.py b/python/tvm/contrib/ethosu/cascader/graph.py index 7aa4a26513cd..ca0d8fef9e16 100644 --- a/python/tvm/contrib/ethosu/cascader/graph.py +++ b/python/tvm/contrib/ethosu/cascader/graph.py @@ -57,6 +57,10 @@ def read_bytes(self): def write_bytes(self): return self._write_bytes + @property + def block_config(self): + return self._block_config + @tvm._ffi.register_object("contrib.ethosu.cascader.Tensor") class Tensor(Object): diff --git a/python/tvm/contrib/ethosu/cascader/tensor_config.py b/python/tvm/contrib/ethosu/cascader/tensor_config.py index 6787ea4f052e..9e48f183ce7b 100644 --- a/python/tvm/contrib/ethosu/cascader/tensor_config.py +++ b/python/tvm/contrib/ethosu/cascader/tensor_config.py @@ -58,9 +58,25 @@ class MemoryRegion(Object): """ - def __init__(self, name: str, size: int, read_bandwidth: int, write_bandwidth: int): + def __init__( + self, + name: str, + size: int, + read_bandwidth: int, + write_bandwidth: int, + read_latency: int = 0, + write_latency: int = 0, + burst_length: int = 1, + ): self.__init_handle_by_constructor__( - _ffi_api.MemoryRegion, name, size, read_bandwidth, write_bandwidth + _ffi_api.MemoryRegion, + name, + size, + read_bandwidth, + write_bandwidth, + read_latency, + write_latency, + burst_length, ) diff --git a/src/contrib/ethosu/cascader/block_config.cc b/src/contrib/ethosu/cascader/block_config.cc index fe698aa17aac..afa65de01356 100644 --- a/src/contrib/ethosu/cascader/block_config.cc +++ b/src/contrib/ethosu/cascader/block_config.cc @@ -33,13 +33,16 @@ namespace ethosu { namespace cascader { void BlockConfigNode::VisitAttrs(AttrVisitor* v) { - Array tmp_arr = make_array(output_shape_); + Array tmp_arr = make_array(input_shape_); + v->Visit("_input_shape", &tmp_arr); + tmp_arr = make_array(output_shape_); v->Visit("_output_shape", &tmp_arr); } -BlockConfig::BlockConfig(const std::vector& output_shape, int compute_cycles, - int output_cycles) { +BlockConfig::BlockConfig(const 
std::vector& input_shape, const std::vector& output_shape, + int compute_cycles, int output_cycles) { auto n = make_object(); + n->input_shape_ = std::move(input_shape); n->output_shape_ = std::move(output_shape); n->compute_cycles_ = compute_cycles; n->output_cycles_ = output_cycles; @@ -47,9 +50,11 @@ BlockConfig::BlockConfig(const std::vector& output_shape, int compute_cycle } TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.BlockConfig") - .set_body_typed([](Array output_shape, int compute_cycles, int output_cycles) { + .set_body_typed([](Array input_shape, Array output_shape, int compute_cycles, + int output_cycles) { + std::vector vinput_shape = make_vector(input_shape); std::vector voutput_shape = make_vector(output_shape); - return BlockConfig(voutput_shape, compute_cycles, output_cycles); + return BlockConfig(vinput_shape, voutput_shape, compute_cycles, output_cycles); }); TVM_REGISTER_NODE_TYPE(BlockConfigNode); diff --git a/src/contrib/ethosu/cascader/block_config.h b/src/contrib/ethosu/cascader/block_config.h index d7da1d90e82e..5e349cee4d06 100644 --- a/src/contrib/ethosu/cascader/block_config.h +++ b/src/contrib/ethosu/cascader/block_config.h @@ -42,6 +42,12 @@ class BlockConfigNode : public Object { public: void VisitAttrs(AttrVisitor* v); + /*! + * \brief Get the shape of input block. + * \return The input shape of the block config. + */ + inline std::vector GetInputBlockShape() const { return input_shape_; } + /*! * \brief Get the shape of output block. * \return The output shape of the block config. @@ -66,6 +72,8 @@ class BlockConfigNode : public Object { protected: friend class BlockConfig; + /*! \brief The shape of the input block */ + std::vector input_shape_; /*! \brief The shape of the output block */ std::vector output_shape_; /*! 
\brief Cycles required to compute this block */ @@ -80,7 +88,8 @@ class BlockConfigNode : public Object { */ class BlockConfig : public ObjectRef { public: - BlockConfig(const std::vector& output_shape, int compute_cycles, int output_cycles); + BlockConfig(const std::vector& input_shape, const std::vector& output_shape, + int compute_cycles, int output_cycles); TVM_DEFINE_OBJECT_REF_METHODS(BlockConfig, ObjectRef, BlockConfigNode); }; diff --git a/src/contrib/ethosu/cascader/graph.cc b/src/contrib/ethosu/cascader/graph.cc index ce28f728d838..96f9768d3172 100644 --- a/src/contrib/ethosu/cascader/graph.cc +++ b/src/contrib/ethosu/cascader/graph.cc @@ -42,6 +42,7 @@ void PerformanceInfoNode::VisitAttrs(AttrVisitor* v) { Array tmp_reads = make_array(read_bytes); v->Visit("_read_bytes", &tmp_reads); v->Visit("_write_bytes", &write_bytes); + v->Visit("_block_config", &block_config); } TVM_REGISTER_NODE_TYPE(PerformanceInfoNode); diff --git a/src/contrib/ethosu/cascader/graph.h b/src/contrib/ethosu/cascader/graph.h index 81cbd1c9da5f..4233493ee805 100644 --- a/src/contrib/ethosu/cascader/graph.h +++ b/src/contrib/ethosu/cascader/graph.h @@ -33,6 +33,7 @@ #include #include +#include "block_config.h" #include "propagator.h" namespace tvm { @@ -71,6 +72,8 @@ class PerformanceInfoNode : public Object { std::vector read_bytes; /*! \brief The number of bytes written to the output tensor */ int64_t write_bytes; + /*! 
\brief The block config used for this performance point */ + BlockConfig block_config; static constexpr const char* _type_key = "contrib.ethosu.cascader.PerformanceInfo"; TVM_DECLARE_FINAL_OBJECT_INFO(PerformanceInfoNode, Object); @@ -85,11 +88,13 @@ class PerformanceInfoNode : public Object { */ class PerformanceInfo : public ObjectRef { public: - PerformanceInfo(int64_t compute_cycles, std::vector read_bytes, int64_t write_bytes) { + PerformanceInfo(int64_t compute_cycles, std::vector read_bytes, int64_t write_bytes, + BlockConfig block_config) { auto n = make_object(); n->compute_cycles = compute_cycles; n->read_bytes = std::move(read_bytes); n->write_bytes = write_bytes; + n->block_config = block_config; data_ = std::move(n); } diff --git a/src/contrib/ethosu/cascader/parts/ethosu.cc b/src/contrib/ethosu/cascader/parts/ethosu.cc index cdbbda18c142..4bc270750f1a 100644 --- a/src/contrib/ethosu/cascader/parts/ethosu.cc +++ b/src/contrib/ethosu/cascader/parts/ethosu.cc @@ -57,7 +57,8 @@ const std::vector EthosuPartNode::GetBytesRead(const std::vector& for (const auto& input_block_config : input_block_configs) { std::map, int> input_blocks = CountStripes(input_block_config, false); for (const auto& block : input_blocks) { - bytes_per_input[i] += mul_reduce(block.first) * block.second; + bytes_per_input[i] += + mul_reduce(block.first) * block.second * input_tensors_[i]->GetDataType().bytes(); } i++; } @@ -136,7 +137,7 @@ const PerformanceInfo EthosuPartNode::GetPerformanceInfo(const StripeConfig& out total_cycles = (block_compute_cycles * num_blocks) + block_output_cycles; } - PerformanceInfo info(total_cycles, read_bytes, write_bytes); + PerformanceInfo info(total_cycles, read_bytes, write_bytes, block_config); return info; } diff --git a/src/contrib/ethosu/cascader/parts/inline.cc b/src/contrib/ethosu/cascader/parts/inline.cc index cb216e7d1454..8854bbd90e81 100644 --- a/src/contrib/ethosu/cascader/parts/inline.cc +++ b/src/contrib/ethosu/cascader/parts/inline.cc 
@@ -23,6 +23,7 @@ #include #include +#include "../block_config.h" #include "../common.h" namespace tvm { @@ -33,7 +34,8 @@ namespace cascader { const PerformanceInfo InlinePartNode::GetPerformanceInfo(const StripeConfig& output_stripe_config, BufferMode buffer_mode) { std::vector read_bytes(input_tensors_.size()); - PerformanceInfo info(0, read_bytes, 0); + BlockConfig block_config = BlockConfig(std::vector(1, 1), std::vector(1, 1), 0, 0); + PerformanceInfo info(0, read_bytes, 0, block_config); return info; } diff --git a/src/contrib/ethosu/cascader/plan_generator.cc b/src/contrib/ethosu/cascader/plan_generator.cc index 9acffb7e9479..a8715c9a9796 100644 --- a/src/contrib/ethosu/cascader/plan_generator.cc +++ b/src/contrib/ethosu/cascader/plan_generator.cc @@ -33,6 +33,7 @@ #include #include +#include "block_config.h" #include "cascader_options.h" #include "common.h" #include "graph.h" @@ -70,6 +71,21 @@ std::vector> EnumerateCombinations(std::vector> va return new_combs; } +float GetTransferEfficiency(const Tensor& tensor, const std::vector& block_shape, + const MemoryRegion& memory) { + // The block_shape represents the shape of the data transfer required for each job. This is used + // to calculate how much of the block_shape is contiguous in memory (source memory for a read or + // destination memory for a write) and subsequently calculate how efficient each memory burst is. 
+ const auto& shape = tensor->GetShape(); + int burst_length = block_shape[block_shape.size() - 1]; + if (block_shape[block_shape.size() - 1] == shape[shape.size() - 1]) { + burst_length *= block_shape[block_shape.size() - 2]; + } + + burst_length *= tensor->GetDataType().bytes(); + return static_cast(memory->burst_length) / std::min(burst_length, memory->burst_length); +} + std::vector GetCascadableAxes(const Part& part) { std::vector cascadable_axes(part->GetOutputTensor()->GetShape().size()); // Check all the propagators to see if an output axis is projected into any @@ -322,6 +338,7 @@ std::vector GenerateSinglePlans( int bandwidth_cycles = 0; int compute_cycles = 0; int mem2mem_cycles = 0; + int initial_mem2mem_cycles = 0; // Pick the correct performance info based on the BufferMode PerformanceInfo perf_info; @@ -332,32 +349,52 @@ std::vector GenerateSinglePlans( } // Calculate the bandwidth cycles by multiplying the bytes read/written by the // bandwidth of the memories + BlockConfig block_config = perf_info->block_config; for (size_t i = 0; i < input_configs.size(); i++) { - bandwidth_cycles += - perf_info->read_bytes[i] / input_configs[i]->GetCopyRegion()->read_bandwidth; + Tensor tensor = input_configs[i]->GetTensor(); + MemoryRegion home_region = input_configs[i]->GetHomeRegion(); + MemoryRegion copy_region = input_configs[i]->GetCopyRegion(); + if (input_configs[i]->DoCopy()) { // This Tensor needs to be copied - Count stripes for this config - Tensor tensor = input_configs[i]->GetTensor(); for (const auto& stripe_config : input_configs[i]->GetStripeConfigs()) { std::map, int> input_blocks = CountStripes(stripe_config, true); + bool first_block = true; for (const auto& block : input_blocks) { int bytes_transferred = mul_reduce(block.first) * tensor->GetDataType().bytes() * tensor->GetCompressionRatio() * block.second; - int read_cycles = - bytes_transferred * input_configs[i]->GetHomeRegion()->read_bandwidth; - int write_cycles = - bytes_transferred * 
input_configs[i]->GetCopyRegion()->write_bandwidth; + int read_cycles = bytes_transferred * home_region->read_bandwidth + + input_configs[i]->GetHomeRegion()->read_latency; + int write_cycles = bytes_transferred * copy_region->write_bandwidth; + + if (first_block) { + first_block = false; + initial_mem2mem_cycles += std::max(read_cycles, write_cycles); + } mem2mem_cycles += std::max(read_cycles, write_cycles); } } } + float read_efficiency = + GetTransferEfficiency(tensor, block_config->GetInputBlockShape(), copy_region); + bandwidth_cycles += + (perf_info->read_bytes[i] / copy_region->read_bandwidth) * read_efficiency; } + MemoryRegion write_region = output_config->GetCopyRegion(); + float write_efficiency = GetTransferEfficiency( + output_config->GetTensor(), block_config->GetOutputBlockShape(), write_region); + bandwidth_cycles += - perf_info->write_bytes / output_config->GetCopyRegion()->write_bandwidth; + perf_info->write_bytes / write_region->write_bandwidth * write_efficiency; compute_cycles = perf_info->compute_cycles; // Take the max of compute and bandwidth cycles as we assume compute cycles // can hide memory latency int cycles = std::max(std::max(compute_cycles, bandwidth_cycles), mem2mem_cycles); + if (cycles > mem2mem_cycles) { + // NPU cycles are the bottleneck - add initial mem2mem transfer cycles + cycles += initial_mem2mem_cycles; + } + int memory_usage = GetInteriorMemoryUsage(input_configs, output_config, options->cascade_region); plans.push_back(Plan(tensor_configs, open_configs, output_config, part_group, diff --git a/src/contrib/ethosu/cascader/tensor_config.cc b/src/contrib/ethosu/cascader/tensor_config.cc index 5e60f522fe4e..fc9abd7346e1 100644 --- a/src/contrib/ethosu/cascader/tensor_config.cc +++ b/src/contrib/ethosu/cascader/tensor_config.cc @@ -38,11 +38,16 @@ void MemoryRegionNode::VisitAttrs(AttrVisitor* v) { v->Visit("size", &size); v->Visit("read_bandwidth", &read_bandwidth); v->Visit("write_bandwidth", &write_bandwidth); + 
v->Visit("read_latency", &read_latency); + v->Visit("write_latency", &write_latency); + v->Visit("burst_length", &burst_length); } TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.MemoryRegion") - .set_body_typed([](String name, int size, int read_bandwidth, int write_bandwidth) { - return MemoryRegion(name, size, read_bandwidth, write_bandwidth); + .set_body_typed([](String name, int size, int read_bandwidth, int write_bandwidth, + int read_latency, int write_latency, int burst_length) { + return MemoryRegion(name, size, read_bandwidth, write_bandwidth, read_latency, write_latency, + burst_length); }); TVM_REGISTER_NODE_TYPE(MemoryRegionNode); diff --git a/src/contrib/ethosu/cascader/tensor_config.h b/src/contrib/ethosu/cascader/tensor_config.h index 6a37f76ce085..134e02c3e4cf 100644 --- a/src/contrib/ethosu/cascader/tensor_config.h +++ b/src/contrib/ethosu/cascader/tensor_config.h @@ -52,6 +52,12 @@ class MemoryRegionNode : public Object { int read_bandwidth; /*! \brief The write bandwidth of the region in bytes per cycle */ int write_bandwidth; + /*! \brief The read latency of the region in cycles */ + int read_latency; + /*! \brief The write latency of the region (presumably in cycles) */ + int write_latency; + /*! 
\brief Length of memory burst */ + int burst_length; static constexpr const char* _type_key = "contrib.ethosu.cascader.MemoryRegion"; TVM_DECLARE_FINAL_OBJECT_INFO(MemoryRegionNode, Object) @@ -59,12 +65,16 @@ class MemoryRegionNode : public Object { class MemoryRegion : public ObjectRef { public: - MemoryRegion(std::string name, int size, int read_bandwidth, int write_bandwidth) { + MemoryRegion(std::string name, int size, int read_bandwidth, int write_bandwidth, + int read_latency, int write_latency, int burst_length) { auto n = make_object(); n->name = name; n->size = size; n->read_bandwidth = read_bandwidth; n->write_bandwidth = write_bandwidth; + n->read_latency = read_latency; + n->write_latency = write_latency; + n->burst_length = burst_length; data_ = std::move(n); } diff --git a/tests/python/contrib/test_ethosu/cascader/conftest.py b/tests/python/contrib/test_ethosu/cascader/conftest.py index cffaf83df0bc..1d55067929fa 100644 --- a/tests/python/contrib/test_ethosu/cascader/conftest.py +++ b/tests/python/contrib/test_ethosu/cascader/conftest.py @@ -27,17 +27,41 @@ @pytest.fixture def FLASH(): - return cs.MemoryRegion(name="FLASH", size=10 ** 7, read_bandwidth=4, write_bandwidth=4) + return cs.MemoryRegion( + name="FLASH", + size=10 ** 7, + read_bandwidth=4, + write_bandwidth=4, + read_latency=0, + write_latency=0, + burst_length=1, + ) @pytest.fixture def DRAM(): - return cs.MemoryRegion(name="DRAM", size=10 ** 9, read_bandwidth=8, write_bandwidth=8) + return cs.MemoryRegion( + name="DRAM", + size=10 ** 9, + read_bandwidth=8, + write_bandwidth=8, + read_latency=0, + write_latency=0, + burst_length=1, + ) @pytest.fixture def SRAM(): - return cs.MemoryRegion(name="SRAM", size=10 ** 6, read_bandwidth=16, write_bandwidth=16) + return cs.MemoryRegion( + name="SRAM", + size=10 ** 6, + read_bandwidth=16, + write_bandwidth=16, + read_latency=0, + write_latency=0, + burst_length=1, + ) if ethosu_enabled: diff --git 
a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py index 3f3935fff1f9..18f15f9257db 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py @@ -318,6 +318,15 @@ def test_best_block_config( block_configs, 1, ) + # Add tensors + input_tensor = cs.Tensor(in_shape, "int8") + part.set_input(0, input_tensor) + if op_type in ("ethosu_conv2d", "ethosu_depthwise_conv2d"): + weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], ifm_channels], "int8") + part.set_input(1, weight_tensor) + + output_tensor = cs.Tensor(out_shape, "int8") + part.set_output(output_tensor) order = [1, 2, 3, 4] if layouts[1] == "NHCWB16" else [1, 2, 4, 3, 0] stripes = [1] * len(output_quantum) diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py index fca136cf4ab4..bf6fb4579bd1 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py @@ -35,7 +35,7 @@ def test_ethosu_part(): ) subkernels = 3 - valid_block_configs = [cs.BlockConfig([1, 2, 4, 16], 15000, 7500)] + valid_block_configs = [cs.BlockConfig([1, 2, 4, 16], [1, 2, 4, 16], 15000, 7500)] part = EthosuPart( te_subgraph, diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py index ba6346afa5d5..60d5fa2a463d 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py @@ -200,7 +200,9 @@ def test_conv_performance( "int8", is_partkernel, ) - block_configs = [cs.BlockConfig(block_shape, compute_cycles, int(output_cycles))] + block_configs = [ + 
cs.BlockConfig(input_block_shape, block_shape, compute_cycles, int(output_cycles)) + ] output_quantum = [1, 1, 2, 8] te_subgraph = cs.TESubgraph([], None) @@ -212,6 +214,8 @@ def test_conv_performance( block_configs, 1, ) + part.set_input(0, cs.Tensor(in_shape, "int8")) + part.set_input(1, cs.Tensor([ifm_channels, kernel[0], kernel[1], out_shape[-1]], "int8")) stripes = [1] * len(output_quantum) offset = [0] * len(output_quantum)