Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions python/tvm/contrib/ethosu/cascader/block_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,21 @@
class BlockConfig(Object):
"""BlockConfig class"""

def __init__(self, output_shape: List[int], compute_cycles: int, output_cycles: int):
def __init__(
self,
input_shape: List[int],
output_shape: List[int],
compute_cycles: int,
output_cycles: int,
):
self.__init_handle_by_constructor__(
_ffi_api.BlockConfig, output_shape, compute_cycles, output_cycles
_ffi_api.BlockConfig, input_shape, output_shape, compute_cycles, output_cycles
)

@property
def input_shape(self) -> List[int]:
    """Return the block's input shape as a freshly built Python list."""
    return [dim for dim in self._input_shape]

@property
def output_shape(self) -> List[int]:
    """Return the block's output shape as a freshly built Python list."""
    return [dim for dim in self._output_shape]
Expand Down
7 changes: 4 additions & 3 deletions python/tvm/contrib/ethosu/cascader/device_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,7 @@ def get_elementwise_block_config(
)
output_cycles *= reduce(lambda a, b: a * b, output_block, 1)
output_cycles = int(math.ceil(output_cycles))
block_config.append(BlockConfig(output_block, 0, output_cycles))
block_config.append(BlockConfig(output_block, output_block, 0, output_cycles))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe use the larger of the input blocks?

break

if output_block[split_axis] == 1:
Expand Down Expand Up @@ -738,9 +738,10 @@ def get_valid_block_configs(
ifm_channels,
is_partkernel,
)
valid_block_configs.append(
BlockConfig(output_block, compute_cycles, output_cycles)
block_config = BlockConfig(
input_block_shape.as_list(), output_block, compute_cycles, output_cycles
)
valid_block_configs.append(block_config)
else:
# Block config does not fit into SHRAM
# Any Block config that is strictly larger than this one will also fail
Expand Down
4 changes: 4 additions & 0 deletions python/tvm/contrib/ethosu/cascader/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ def read_bytes(self):
def write_bytes(self):
    """Return the value stored in ``_write_bytes`` on this object."""
    written = self._write_bytes
    return written

@property
def block_config(self):
    """Return the block config stored in ``_block_config`` on this object."""
    config = self._block_config
    return config


@tvm._ffi.register_object("contrib.ethosu.cascader.Tensor")
class Tensor(Object):
Expand Down
20 changes: 18 additions & 2 deletions python/tvm/contrib/ethosu/cascader/tensor_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,25 @@ class MemoryRegion(Object):

"""

def __init__(self, name: str, size: int, read_bandwidth: int, write_bandwidth: int):
def __init__(
self,
name: str,
size: int,
read_bandwidth: int,
write_bandwidth: int,
read_latency: int = 0,
write_latency: int = 0,
burst_length: int = 1,
Comment on lines +67 to +69
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: add these to the docstring as well

):
self.__init_handle_by_constructor__(
_ffi_api.MemoryRegion, name, size, read_bandwidth, write_bandwidth
_ffi_api.MemoryRegion,
name,
size,
read_bandwidth,
write_bandwidth,
read_latency,
write_latency,
burst_length,
)


Expand Down
15 changes: 10 additions & 5 deletions src/contrib/ethosu/cascader/block_config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,28 @@ namespace ethosu {
namespace cascader {

void BlockConfigNode::VisitAttrs(AttrVisitor* v) {
Array<Integer> tmp_arr = make_array(output_shape_);
Array<Integer> tmp_arr = make_array(input_shape_);
v->Visit("_input_shape", &tmp_arr);
tmp_arr = make_array(output_shape_);
v->Visit("_output_shape", &tmp_arr);
}

BlockConfig::BlockConfig(const std::vector<int>& output_shape, int compute_cycles,
int output_cycles) {
BlockConfig::BlockConfig(const std::vector<int>& input_shape, const std::vector<int>& output_shape,
int compute_cycles, int output_cycles) {
auto n = make_object<BlockConfigNode>();
n->input_shape_ = std::move(input_shape);
n->output_shape_ = std::move(output_shape);
n->compute_cycles_ = compute_cycles;
n->output_cycles_ = output_cycles;
data_ = std::move(n);
}

TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.BlockConfig")
.set_body_typed([](Array<Integer> output_shape, int compute_cycles, int output_cycles) {
.set_body_typed([](Array<Integer> input_shape, Array<Integer> output_shape, int compute_cycles,
int output_cycles) {
std::vector<int> vinput_shape = make_vector<int, Integer>(input_shape);
std::vector<int> voutput_shape = make_vector<int, Integer>(output_shape);
return BlockConfig(voutput_shape, compute_cycles, output_cycles);
return BlockConfig(vinput_shape, voutput_shape, compute_cycles, output_cycles);
});

TVM_REGISTER_NODE_TYPE(BlockConfigNode);
Expand Down
11 changes: 10 additions & 1 deletion src/contrib/ethosu/cascader/block_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ class BlockConfigNode : public Object {
public:
void VisitAttrs(AttrVisitor* v);

/*!
 * \brief Retrieve the input block shape held by this block config.
 * \return A copy of the input block's shape.
 */
inline std::vector<int> GetInputBlockShape() const {
  return input_shape_;
}

/*!
* \brief Get the shape of output block.
* \return The output shape of the block config.
Expand All @@ -66,6 +72,8 @@ class BlockConfigNode : public Object {
protected:
friend class BlockConfig;

/*! \brief The shape of the input block */
std::vector<int> input_shape_;
/*! \brief The shape of the output block */
std::vector<int> output_shape_;
/*! \brief Cycles required to compute this block */
Expand All @@ -80,7 +88,8 @@ class BlockConfigNode : public Object {
*/
class BlockConfig : public ObjectRef {
public:
BlockConfig(const std::vector<int>& output_shape, int compute_cycles, int output_cycles);
BlockConfig(const std::vector<int>& input_shape, const std::vector<int>& output_shape,
int compute_cycles, int output_cycles);

TVM_DEFINE_OBJECT_REF_METHODS(BlockConfig, ObjectRef, BlockConfigNode);
};
Expand Down
1 change: 1 addition & 0 deletions src/contrib/ethosu/cascader/graph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ void PerformanceInfoNode::VisitAttrs(AttrVisitor* v) {
Array<IntImm> tmp_reads = make_array(read_bytes);
v->Visit("_read_bytes", &tmp_reads);
v->Visit("_write_bytes", &write_bytes);
v->Visit("_block_config", &block_config);
}

TVM_REGISTER_NODE_TYPE(PerformanceInfoNode);
Expand Down
7 changes: 6 additions & 1 deletion src/contrib/ethosu/cascader/graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <utility>
#include <vector>

#include "block_config.h"
#include "propagator.h"

namespace tvm {
Expand Down Expand Up @@ -71,6 +72,8 @@ class PerformanceInfoNode : public Object {
std::vector<int64_t> read_bytes;
/*! \brief The number of bytes written to the output tensor */
int64_t write_bytes;
/*! \brief The block config used for this performance point */
BlockConfig block_config;

static constexpr const char* _type_key = "contrib.ethosu.cascader.PerformanceInfo";
TVM_DECLARE_FINAL_OBJECT_INFO(PerformanceInfoNode, Object);
Expand All @@ -85,11 +88,13 @@ class PerformanceInfoNode : public Object {
*/
class PerformanceInfo : public ObjectRef {
public:
PerformanceInfo(int64_t compute_cycles, std::vector<int64_t> read_bytes, int64_t write_bytes) {
PerformanceInfo(int64_t compute_cycles, std::vector<int64_t> read_bytes, int64_t write_bytes,
BlockConfig block_config) {
auto n = make_object<PerformanceInfoNode>();
n->compute_cycles = compute_cycles;
n->read_bytes = std::move(read_bytes);
n->write_bytes = write_bytes;
n->block_config = block_config;
data_ = std::move(n);
}

Expand Down
5 changes: 3 additions & 2 deletions src/contrib/ethosu/cascader/parts/ethosu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ const std::vector<int64_t> EthosuPartNode::GetBytesRead(const std::vector<int>&
for (const auto& input_block_config : input_block_configs) {
std::map<std::vector<int>, int> input_blocks = CountStripes(input_block_config, false);
for (const auto& block : input_blocks) {
bytes_per_input[i] += mul_reduce(block.first) * block.second;
bytes_per_input[i] +=
mul_reduce(block.first) * block.second * input_tensors_[i]->GetDataType().bytes();
}
i++;
}
Expand Down Expand Up @@ -136,7 +137,7 @@ const PerformanceInfo EthosuPartNode::GetPerformanceInfo(const StripeConfig& out
total_cycles = (block_compute_cycles * num_blocks) + block_output_cycles;
}

PerformanceInfo info(total_cycles, read_bytes, write_bytes);
PerformanceInfo info(total_cycles, read_bytes, write_bytes, block_config);
return info;
}

Expand Down
4 changes: 3 additions & 1 deletion src/contrib/ethosu/cascader/parts/inline.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <utility>
#include <vector>

#include "../block_config.h"
#include "../common.h"

namespace tvm {
Expand All @@ -33,7 +34,8 @@ namespace cascader {
const PerformanceInfo InlinePartNode::GetPerformanceInfo(const StripeConfig& output_stripe_config,
BufferMode buffer_mode) {
std::vector<int64_t> read_bytes(input_tensors_.size());
PerformanceInfo info(0, read_bytes, 0);
BlockConfig block_config = BlockConfig(std::vector<int>(1, 1), std::vector<int>(1, 1), 0, 0);
PerformanceInfo info(0, read_bytes, 0, block_config);
return info;
}

Expand Down
53 changes: 45 additions & 8 deletions src/contrib/ethosu/cascader/plan_generator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <utility>
#include <vector>

#include "block_config.h"
#include "cascader_options.h"
#include "common.h"
#include "graph.h"
Expand Down Expand Up @@ -70,6 +71,21 @@ std::vector<std::vector<T>> EnumerateCombinations(std::vector<std::vector<T>> va
return new_combs;
}

/*!
 * \brief Estimate how well a block transfer uses the bursts of a memory region.
 * \param tensor The tensor being transferred; its shape and data type are read.
 * \param block_shape The shape of the data transferred for each job.
 * \param memory The memory region whose burst length is compared against the contiguous run.
 * \return memory->burst_length divided by min(contiguous bytes, memory->burst_length): 1 when a
 * contiguous run covers at least one whole burst, greater than 1 otherwise. Returns 1 for
 * degenerate inputs (an empty shape or a non-positive contiguous size).
 */
float GetTransferEfficiency(const Tensor& tensor, const std::vector<int>& block_shape,
                            const MemoryRegion& memory) {
  // The block_shape represents the shape of the data transfer required for each job. This is used
  // to calculate how much of the block_shape is contiguous in memory (source memory for a read or
  // destination memory for a write) and subsequently calculate how efficient each memory burst is.
  const auto& shape = tensor->GetShape();
  if (block_shape.empty() || shape.size() == 0) {
    // Nothing to index into; report a neutral efficiency instead of reading out of bounds.
    return 1.0f;
  }

  int contiguous_bytes = block_shape.back();
  // When the block spans the whole innermost axis, the run extends over the next axis too.
  // Rank-1 block shapes have no such axis, so the extension is skipped for them.
  if (block_shape.size() >= 2 && block_shape.back() == shape[shape.size() - 1]) {
    contiguous_bytes *= block_shape[block_shape.size() - 2];
  }
  contiguous_bytes *= tensor->GetDataType().bytes();

  const int effective_burst = std::min(contiguous_bytes, memory->burst_length);
  if (effective_burst <= 0) {
    // Avoid dividing by zero (or producing a negative factor) for degenerate sizes.
    return 1.0f;
  }
  return static_cast<float>(memory->burst_length) / effective_burst;
}

std::vector<bool> GetCascadableAxes(const Part& part) {
std::vector<bool> cascadable_axes(part->GetOutputTensor()->GetShape().size());
// Check all the propagators to see if an output axis is projected into any
Expand Down Expand Up @@ -322,6 +338,7 @@ std::vector<Plan> GenerateSinglePlans(
int bandwidth_cycles = 0;
int compute_cycles = 0;
int mem2mem_cycles = 0;
int initial_mem2mem_cycles = 0;

// Pick the correct performance info based on the BufferMode
PerformanceInfo perf_info;
Expand All @@ -332,32 +349,52 @@ std::vector<Plan> GenerateSinglePlans(
}
// Calculate the bandwidth cycles by multiplying the bytes read/written by the
// bandwidth of the memories
BlockConfig block_config = perf_info->block_config;
for (size_t i = 0; i < input_configs.size(); i++) {
bandwidth_cycles +=
perf_info->read_bytes[i] / input_configs[i]->GetCopyRegion()->read_bandwidth;
Tensor tensor = input_configs[i]->GetTensor();
MemoryRegion home_region = input_configs[i]->GetHomeRegion();
MemoryRegion copy_region = input_configs[i]->GetCopyRegion();

if (input_configs[i]->DoCopy()) {
// This Tensor needs to be copied - Count stripes for this config
Tensor tensor = input_configs[i]->GetTensor();
for (const auto& stripe_config : input_configs[i]->GetStripeConfigs()) {
std::map<std::vector<int>, int> input_blocks = CountStripes(stripe_config, true);
bool first_block = true;
for (const auto& block : input_blocks) {
int bytes_transferred = mul_reduce(block.first) * tensor->GetDataType().bytes() *
tensor->GetCompressionRatio() * block.second;
int read_cycles =
bytes_transferred * input_configs[i]->GetHomeRegion()->read_bandwidth;
int write_cycles =
bytes_transferred * input_configs[i]->GetCopyRegion()->write_bandwidth;
int read_cycles = bytes_transferred * home_region->read_bandwidth +
input_configs[i]->GetHomeRegion()->read_latency;
Comment on lines +366 to +367
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

Suggested change
int read_cycles = bytes_transferred * home_region->read_bandwidth +
input_configs[i]->GetHomeRegion()->read_latency;
int read_cycles = bytes_transferred * home_region->read_bandwidth +
home_region->read_latency;

int write_cycles = bytes_transferred * copy_region->write_bandwidth;

if (first_block) {
first_block = false;
initial_mem2mem_cycles += std::max(read_cycles, write_cycles);
}
mem2mem_cycles += std::max(read_cycles, write_cycles);
}
}
}
float read_efficiency =
GetTransferEfficiency(tensor, block_config->GetInputBlockShape(), copy_region);
bandwidth_cycles +=
(perf_info->read_bytes[i] / copy_region->read_bandwidth) * read_efficiency;
}
MemoryRegion write_region = output_config->GetCopyRegion();
float write_efficiency = GetTransferEfficiency(
output_config->GetTensor(), block_config->GetOutputBlockShape(), write_region);

bandwidth_cycles +=
perf_info->write_bytes / output_config->GetCopyRegion()->write_bandwidth;
perf_info->write_bytes / write_region->write_bandwidth * write_efficiency;
compute_cycles = perf_info->compute_cycles;
// Take the max of compute and bandwidth cycles as we assume compute cycles
// can hide memory latency
int cycles = std::max(std::max(compute_cycles, bandwidth_cycles), mem2mem_cycles);
if (cycles > mem2mem_cycles) {
// NPU cycles are the bottleneck - add initial mem2mem transfer cycles
cycles += initial_mem2mem_cycles;
}

int memory_usage =
GetInteriorMemoryUsage(input_configs, output_config, options->cascade_region);
plans.push_back(Plan(tensor_configs, open_configs, output_config, part_group,
Expand Down
9 changes: 7 additions & 2 deletions src/contrib/ethosu/cascader/tensor_config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,16 @@ void MemoryRegionNode::VisitAttrs(AttrVisitor* v) {
v->Visit("size", &size);
v->Visit("read_bandwidth", &read_bandwidth);
v->Visit("write_bandwidth", &write_bandwidth);
v->Visit("read_latency", &read_latency);
v->Visit("write_latency", &write_latency);
v->Visit("burst_length", &burst_length);
}

TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.MemoryRegion")
.set_body_typed([](String name, int size, int read_bandwidth, int write_bandwidth) {
return MemoryRegion(name, size, read_bandwidth, write_bandwidth);
.set_body_typed([](String name, int size, int read_bandwidth, int write_bandwidth,
int read_latency, int write_latency, int burst_length) {
return MemoryRegion(name, size, read_bandwidth, write_bandwidth, read_latency, write_latency,
burst_length);
});

TVM_REGISTER_NODE_TYPE(MemoryRegionNode);
Expand Down
12 changes: 11 additions & 1 deletion src/contrib/ethosu/cascader/tensor_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,19 +52,29 @@ class MemoryRegionNode : public Object {
int read_bandwidth;
/*! \brief The write bandwidth of the region in bytes per cycle */
int write_bandwidth;
/*! \brief The read latency of the region in cycles */
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Update the docstring

int read_latency;
/*! \brief The write latency of the region in cycles */
int write_latency;
/*! \brief Length of memory burst */
int burst_length;

static constexpr const char* _type_key = "contrib.ethosu.cascader.MemoryRegion";
TVM_DECLARE_FINAL_OBJECT_INFO(MemoryRegionNode, Object)
};

class MemoryRegion : public ObjectRef {
public:
MemoryRegion(std::string name, int size, int read_bandwidth, int write_bandwidth) {
MemoryRegion(std::string name, int size, int read_bandwidth, int write_bandwidth,
int read_latency, int write_latency, int burst_length) {
auto n = make_object<MemoryRegionNode>();
n->name = name;
n->size = size;
n->read_bandwidth = read_bandwidth;
n->write_bandwidth = write_bandwidth;
n->read_latency = read_latency;
n->write_latency = write_latency;
n->burst_length = burst_length;
data_ = std::move(n);
}

Expand Down
Loading