-
Notifications
You must be signed in to change notification settings - Fork 3.7k
[microNPU] Improve cascader memory transfer estimates #10508
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -58,9 +58,25 @@ class MemoryRegion(Object): | |
|
|
||
| """ | ||
|
|
||
| def __init__(self, name: str, size: int, read_bandwidth: int, write_bandwidth: int): | ||
| def __init__( | ||
| self, | ||
| name: str, | ||
| size: int, | ||
| read_bandwidth: int, | ||
| write_bandwidth: int, | ||
| read_latency: int = 0, | ||
| write_latency: int = 0, | ||
| burst_length: int = 1, | ||
|
Comment on lines
+67
to
+69
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: add these to the docstring as well |
||
| ): | ||
| self.__init_handle_by_constructor__( | ||
| _ffi_api.MemoryRegion, name, size, read_bandwidth, write_bandwidth | ||
| _ffi_api.MemoryRegion, | ||
| name, | ||
| size, | ||
| read_bandwidth, | ||
| write_bandwidth, | ||
| read_latency, | ||
| write_latency, | ||
| burst_length, | ||
| ) | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -33,6 +33,7 @@ | |||||||||
| #include <utility> | ||||||||||
| #include <vector> | ||||||||||
|
|
||||||||||
| #include "block_config.h" | ||||||||||
| #include "cascader_options.h" | ||||||||||
| #include "common.h" | ||||||||||
| #include "graph.h" | ||||||||||
|
|
@@ -70,6 +71,21 @@ std::vector<std::vector<T>> EnumerateCombinations(std::vector<std::vector<T>> va | |||||||||
| return new_combs; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| // Estimate how efficiently transfers of `block_shape`-sized blocks of `tensor` use the bursts of `memory`. | ||||||||||
| // Returns memory->burst_length divided by min(contiguous bytes per block, memory->burst_length): | ||||||||||
| // 1.0 when a contiguous run is at least one full burst, and larger when the run is shorter. | ||||||||||
| // NOTE(review): block_shape[size() - 2] assumes at least two dimensions (and size() - 1 assumes a | ||||||||||
| // non-empty shape) - confirm that callers guarantee this. | ||||||||||
| float GetTransferEfficiency(const Tensor& tensor, const std::vector<int>& block_shape, | ||||||||||
| const MemoryRegion& memory) { | ||||||||||
| // The block_shape represents the shape of the data transfer required for each job. This is used | ||||||||||
| // to calculate how much of the block_shape is contiguous in memory (source memory for a read or | ||||||||||
| // destination memory for a write) and subsequently calculate how efficient each memory burst is. | ||||||||||
| const auto& shape = tensor->GetShape(); | ||||||||||
| int burst_length = block_shape[block_shape.size() - 1]; | ||||||||||
| // When the block covers the full innermost tensor axis, the run is extended by the second-innermost | ||||||||||
| // block dimension (presumably because those rows are adjacent in memory). | ||||||||||
| if (block_shape[block_shape.size() - 1] == shape[shape.size() - 1]) { | ||||||||||
| burst_length *= block_shape[block_shape.size() - 2]; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| // Scale the element count by the data type's byte size. | ||||||||||
| burst_length *= tensor->GetDataType().bytes(); | ||||||||||
| // NOTE(review): a zero-sized block dimension would make the divisor 0 - confirm this cannot occur. | ||||||||||
| return static_cast<float>(memory->burst_length) / std::min(burst_length, memory->burst_length); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| std::vector<bool> GetCascadableAxes(const Part& part) { | ||||||||||
| std::vector<bool> cascadable_axes(part->GetOutputTensor()->GetShape().size()); | ||||||||||
| // Check all the propagators to see if an output axis is projected into any | ||||||||||
|
|
@@ -322,6 +338,7 @@ std::vector<Plan> GenerateSinglePlans( | |||||||||
| int bandwidth_cycles = 0; | ||||||||||
| int compute_cycles = 0; | ||||||||||
| int mem2mem_cycles = 0; | ||||||||||
| int initial_mem2mem_cycles = 0; | ||||||||||
|
|
||||||||||
| // Pick the correct performance info based on the BufferMode | ||||||||||
| PerformanceInfo perf_info; | ||||||||||
|
|
@@ -332,32 +349,52 @@ std::vector<Plan> GenerateSinglePlans( | |||||||||
| } | ||||||||||
| // Calculate the bandwidth cycles by multiplying the bytes read/written by the | ||||||||||
| // bandwidth of the memories | ||||||||||
| BlockConfig block_config = perf_info->block_config; | ||||||||||
| for (size_t i = 0; i < input_configs.size(); i++) { | ||||||||||
| bandwidth_cycles += | ||||||||||
| perf_info->read_bytes[i] / input_configs[i]->GetCopyRegion()->read_bandwidth; | ||||||||||
| Tensor tensor = input_configs[i]->GetTensor(); | ||||||||||
| MemoryRegion home_region = input_configs[i]->GetHomeRegion(); | ||||||||||
| MemoryRegion copy_region = input_configs[i]->GetCopyRegion(); | ||||||||||
|
|
||||||||||
| if (input_configs[i]->DoCopy()) { | ||||||||||
| // This Tensor needs to be copied - Count stripes for this config | ||||||||||
| Tensor tensor = input_configs[i]->GetTensor(); | ||||||||||
| for (const auto& stripe_config : input_configs[i]->GetStripeConfigs()) { | ||||||||||
| std::map<std::vector<int>, int> input_blocks = CountStripes(stripe_config, true); | ||||||||||
| bool first_block = true; | ||||||||||
| for (const auto& block : input_blocks) { | ||||||||||
| int bytes_transferred = mul_reduce(block.first) * tensor->GetDataType().bytes() * | ||||||||||
| tensor->GetCompressionRatio() * block.second; | ||||||||||
| int read_cycles = | ||||||||||
| bytes_transferred * input_configs[i]->GetHomeRegion()->read_bandwidth; | ||||||||||
| int write_cycles = | ||||||||||
| bytes_transferred * input_configs[i]->GetCopyRegion()->write_bandwidth; | ||||||||||
| int read_cycles = bytes_transferred * home_region->read_bandwidth + | ||||||||||
| input_configs[i]->GetHomeRegion()->read_latency; | ||||||||||
|
Comment on lines
+366
to
+367
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit:
Suggested change
|
||||||||||
| int write_cycles = bytes_transferred * copy_region->write_bandwidth; | ||||||||||
|
|
||||||||||
| if (first_block) { | ||||||||||
| first_block = false; | ||||||||||
| initial_mem2mem_cycles += std::max(read_cycles, write_cycles); | ||||||||||
| } | ||||||||||
| mem2mem_cycles += std::max(read_cycles, write_cycles); | ||||||||||
| } | ||||||||||
| } | ||||||||||
| } | ||||||||||
| float read_efficiency = | ||||||||||
| GetTransferEfficiency(tensor, block_config->GetInputBlockShape(), copy_region); | ||||||||||
| bandwidth_cycles += | ||||||||||
| (perf_info->read_bytes[i] / copy_region->read_bandwidth) * read_efficiency; | ||||||||||
| } | ||||||||||
| MemoryRegion write_region = output_config->GetCopyRegion(); | ||||||||||
| float write_efficiency = GetTransferEfficiency( | ||||||||||
| output_config->GetTensor(), block_config->GetOutputBlockShape(), write_region); | ||||||||||
|
|
||||||||||
| bandwidth_cycles += | ||||||||||
| perf_info->write_bytes / output_config->GetCopyRegion()->write_bandwidth; | ||||||||||
| perf_info->write_bytes / write_region->write_bandwidth * write_efficiency; | ||||||||||
| compute_cycles = perf_info->compute_cycles; | ||||||||||
| // Take the max of compute and bandwidth cycles as we assume compute cycles | ||||||||||
| // can hide memory latency | ||||||||||
| int cycles = std::max(std::max(compute_cycles, bandwidth_cycles), mem2mem_cycles); | ||||||||||
| if (cycles > mem2mem_cycles) { | ||||||||||
| // NPU cycles are the bottleneck - add initial mem2mem transfer cycles | ||||||||||
| cycles += initial_mem2mem_cycles; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| int memory_usage = | ||||||||||
| GetInteriorMemoryUsage(input_configs, output_config, options->cascade_region); | ||||||||||
| plans.push_back(Plan(tensor_configs, open_configs, output_config, part_group, | ||||||||||
|
|
||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -52,19 +52,29 @@ class MemoryRegionNode : public Object { | |
| int read_bandwidth; | ||
| /*! \brief The write bandwidth of the region in bytes per cycle */ | ||
| int write_bandwidth; | ||
| /*! \brief The read latency of the region in cycles */ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: Update the docstring |
||
| int read_latency; | ||
| /*! \brief The write latency of the region in cycles */ | ||
| int write_latency; | ||
| /*! \brief Length of memory burst */ | ||
| int burst_length; | ||
|
|
||
| static constexpr const char* _type_key = "contrib.ethosu.cascader.MemoryRegion"; | ||
| TVM_DECLARE_FINAL_OBJECT_INFO(MemoryRegionNode, Object) | ||
| }; | ||
|
|
||
| class MemoryRegion : public ObjectRef { | ||
| public: | ||
| MemoryRegion(std::string name, int size, int read_bandwidth, int write_bandwidth) { | ||
| MemoryRegion(std::string name, int size, int read_bandwidth, int write_bandwidth, | ||
| int read_latency, int write_latency, int burst_length) { | ||
| auto n = make_object<MemoryRegionNode>(); | ||
| n->name = name; | ||
| n->size = size; | ||
| n->read_bandwidth = read_bandwidth; | ||
| n->write_bandwidth = write_bandwidth; | ||
| n->read_latency = read_latency; | ||
| n->write_latency = write_latency; | ||
| n->burst_length = burst_length; | ||
| data_ = std::move(n); | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe use the larger of the input blocks?