Skip to content

Commit

Permalink
#16367: Added support to enable dram and l1 memory collection without…
Browse files Browse the repository at this point in the history
… saving to disk
  • Loading branch information
tapspatel committed Dec 31, 2024
1 parent ee162e9 commit eb4efbd
Show file tree
Hide file tree
Showing 15 changed files with 231 additions and 0 deletions.
6 changes: 6 additions & 0 deletions tech_reports/memory/allocator.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ These reports can be enabled from C++ and Python.
// API to dump state of memory for a given device. Optional prefix will be prepended to the report.
DumpDeviceMemoryState(const Device *device, std::string prefix="");

// API to get dram memory view for a given device
GetDramMemoryView(const Device *device);

// API to get l1 memory view for a given device
GetL1MemoryView(const Device *device);

// APIs to enable/disable memory reports for each Program's compile
EnableMemoryReports();
DisableMemoryReports();
Expand Down
29 changes: 29 additions & 0 deletions tt_metal/detail/reports/memory_reporter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,39 @@ void MemoryReporter::init_reports() {
write_headers(
this->program_memory_usage_summary_report_, this->program_l1_usage_summary_report_, /*add_program_id=*/true);
}

void DumpDeviceMemoryState(const Device* device, const std::string& prefix) {
MemoryReporter::inst().dump_memory_usage_state(device, std::move(prefix));
}

MemoryView get_memory_view(const Device* device, const BufferType& buffer_type) {
auto stats = device->get_memory_allocation_statistics(buffer_type);
auto num_banks_ = device->num_banks(buffer_type);

return MemoryView{
.num_banks = num_banks_,
.total_allocatable_per_bank_size_bytes = stats.total_allocatable_size_bytes,
.total_allocated_per_bank_size_bytes = stats.total_allocated_bytes,
.total_free_per_bank_size_bytes = stats.total_free_bytes,
.total_allocatable_size_bytes = stats.total_allocatable_size_bytes * num_banks_,
.total_allocated_size_bytes = stats.total_allocated_bytes * num_banks_,
.total_free_size_bytes = stats.total_free_bytes * num_banks_,
.largest_contiguous_free_block_per_bank_size_bytes = stats.largest_free_block_bytes,
.blockTable = device->get_block_table(buffer_type)};
}

MemoryView MemoryReporter::get_dram_memory_view(const Device* device) const {
return get_memory_view(device, BufferType::DRAM);
}

MemoryView MemoryReporter::get_l1_memory_view(const Device* device) const {
return get_memory_view(device, BufferType::L1);
}

MemoryView GetDramMemoryView(const Device* device) { return MemoryReporter::inst().get_dram_memory_view(device); }

MemoryView GetL1MemoryView(const Device* device) { return MemoryReporter::inst().get_l1_memory_view(device); }

bool MemoryReporter::enabled() { return is_enabled_; }

void MemoryReporter::toggle(bool state) { is_enabled_ = state; }
Expand Down
59 changes: 59 additions & 0 deletions tt_metal/detail/reports/memory_reporter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
#include <atomic>
#include <fstream>
#include <string>
#include <unordered_map>
#include <vector>

namespace tt::tt_metal {
inline namespace v0 {

Expand All @@ -16,6 +19,7 @@ class Device;

} // namespace v0
namespace detail {
struct MemoryView;

/**
* Enable generation of reports for memory allocation statistics.
Expand Down Expand Up @@ -60,6 +64,59 @@ void DisableMemoryReports();
* */
void DumpDeviceMemoryState(const Device* device, const std::string& prefix = "");

/**
* Populates MemoryView for DRAM. Used when storing to disk is not an option.
*
* num_banks: total number of dram banks for given device
* total_allocatable_per_bank_size_bytes: total allocatable size per bank of dram in bytes
* total_allocated_per_bank_size_bytes: currently allocated size per bank of dram in bytes
* total_free_per_bank_size_bytes: total free size per bank of dram in bytes
* total_allocatable_size_bytes: total allocatable size of dram in bytes
* total_allocated_size_bytes: currently allocated size of dram in bytes
* total_free_size_bytes: total free size of dram in bytes
* largest_contiguous_free_block_per_bank_size_bytes: largest contiguous free block of dram in bytes
* blockTable: list of all blocks in dram (blockID, address, size, prevID, nextID, allocated)
*
* std::vector<std::unordered_map<std::string, std::string>>: list of all blocks in dram (blockID, address, size,
* prevID, nextID, allocated)
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------|---------------------------------------------------|-----------------|--------------------------------------------------------|----------|
* | device | The device for which memory stats will be dumped. | const Device * | | True |
* */
MemoryView GetDramMemoryView(const Device* device);

/**
* Populates MemoryView for L1. Used when storing to disk is not an option.
*
* num_banks: total number of dram banks for given device
* total_allocatable_per_bank_size_bytes: total allocatable size per bank of dram in bytes
* total_allocated_per_bank_size_bytes: currently allocated size per bank of dram in bytes
* total_free_per_bank_size_bytes: total free size per bank of dram in bytes
* total_allocatable_size_bytes: total allocatable size of dram in bytes
* total_allocated_size_bytes: currently allocated size of dram in bytes
* total_free_size_bytes: total free size of dram in bytes
* largest_contiguous_free_block_per_bank_size_bytes: largest contiguous free block of dram in bytes
* blockTable: list of all blocks in dram (blockID, address, size, prevID, nextID, allocated)
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------|---------------------------------------------------|-----------------|--------------------------------------------------------|----------|
* | device | The device for which memory stats will be dumped. | const Device * | | True |
* */
MemoryView GetL1MemoryView(const Device* device);

struct MemoryView {
std::uint64_t num_banks;
size_t total_allocatable_per_bank_size_bytes;
size_t total_allocated_per_bank_size_bytes;
size_t total_free_per_bank_size_bytes;
size_t total_allocatable_size_bytes; // total_allocatable_per_bank_size_bytes * num_banks
size_t total_allocated_size_bytes; // total_allocated_per_bank_size_bytes * num_banks
size_t total_free_size_bytes; // total_free_per_bank_size_bytes * num_banks
size_t largest_contiguous_free_block_per_bank_size_bytes;
std::vector<std::unordered_map<std::string, std::string>> blockTable;
};

class MemoryReporter {
public:
MemoryReporter& operator=(const MemoryReporter&) = delete;
Expand All @@ -70,6 +127,8 @@ class MemoryReporter {
void flush_program_memory_usage(uint64_t program_id, const Device* device);

void dump_memory_usage_state(const Device* device, const std::string& prefix = "") const;
MemoryView get_dram_memory_view(const Device* device) const;
MemoryView get_l1_memory_view(const Device* device) const;

static void toggle(bool state);
static MemoryReporter& inst();
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ class Algorithm {

virtual void dump_blocks(std::ostream& out) const = 0;

virtual std::vector<std::unordered_map<std::string, std::string>> get_block_table() const = 0;

virtual void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) = 0;

virtual void reset_size() = 0;
Expand Down
18 changes: 18 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,24 @@ void FreeList::dump_blocks(std::ostream& out) const {
out << "\n";
}

std::vector<std::unordered_map<std::string, std::string>> FreeList::get_block_table() const {
std::vector<std::unordered_map<std::string, std::string>> blocks;
boost::local_shared_ptr<Block> curr_block = this->block_head_;

while (curr_block != nullptr) {
std::unordered_map<std::string, std::string> blockEntry;

blockEntry["address"] = std::to_string(curr_block->address + this->offset_bytes_); /// bytes
blockEntry["size"] = std::to_string(curr_block->size); // bytes
blockEntry["allocated"] = this->is_allocated(curr_block) ? "yes" : "no";

blocks.push_back(blockEntry);
curr_block = curr_block->next_block;
}

return blocks;
}

void FreeList::shrink_size(DeviceAddr shrink_size, bool bottom_up) {
if (shrink_size == 0) {
return;
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ class FreeList : public Algorithm {

void dump_blocks(std::ostream& out) const;

std::vector<std::unordered_map<std::string, std::string>> get_block_table() const;

void shrink_size(DeviceAddr shrink_size, bool bottom_up = true);

void reset_size();
Expand Down
22 changes: 22 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list_opt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,28 @@ void FreeListOpt::dump_blocks(std::ostream& out) const {
}
}

std::vector<std::unordered_map<std::string, std::string>> FreeListOpt::get_block_table() const {
std::vector<std::unordered_map<std::string, std::string>> blocks;

for (size_t i = 0; i < block_address_.size(); i++) {
std::unordered_map<std::string, std::string> blockEntry;

if (!meta_block_is_allocated_[i]) {
continue;
}

blockEntry["blockID"] = std::to_string(i);
blockEntry["address"] = std::to_string(block_address_[i]); // bytes
blockEntry["size"] = std::to_string(block_size_[i]); // bytes
blockEntry["prevID"] = std::to_string(block_prev_block_[i]);
blockEntry["nextID"] = std::to_string(block_next_block_[i]);
blockEntry["allocated"] = block_is_allocated_[i] ? "yes" : "no";
blocks.push_back(blockEntry);
}

return blocks;
}

void FreeListOpt::shrink_size(DeviceAddr shrink_size, bool bottom_up) {
if (shrink_size == 0) {
return;
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list_opt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ class FreeListOpt : public Algorithm {

void dump_blocks(std::ostream& out) const override;

std::vector<std::unordered_map<std::string, std::string>> get_block_table() const override;

void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) override;

void reset_size() override;
Expand Down
22 changes: 22 additions & 0 deletions tt_metal/impl/allocator/allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,15 @@ void BankManager::dump_blocks(std::ofstream& out) const {
}
}

std::vector<std::unordered_map<std::string, std::string>> BankManager::get_block_table() const {
if (this->allocator_) {
return this->allocator_->get_block_table();
}

log_warning("allocator is not initialized, cannot get block table for memory");
return {};
}

void BankManager::shrink_size(DeviceAddr shrink_size, bool bottom_up) {
if (this->allocator_) {
this->allocator_->shrink_size(shrink_size, bottom_up);
Expand Down Expand Up @@ -375,6 +384,19 @@ void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_typ
}
}

std::vector<std::unordered_map<std::string, std::string>> get_block_table(
const Allocator& allocator, const BufferType& buffer_type) {
switch (buffer_type) {
case BufferType::DRAM: return allocator.dram_manager.get_block_table();
case BufferType::L1: return allocator.l1_manager.get_block_table();
case BufferType::L1_SMALL: return allocator.l1_small_manager.get_block_table();
case BufferType::TRACE: return allocator.trace_buffer_manager.get_block_table();
default: {
TT_THROW("Unsupported buffer type!");
}
}
}

std::optional<DeviceAddr> lowest_occupied_l1_address(const Allocator& allocator, uint32_t bank_id) {
// l1_manager always sits below l1_small_manager in the address space, so there is no need to check l1_small_manager
return allocator.l1_manager.lowest_occupied_address(bank_id);
Expand Down
5 changes: 5 additions & 0 deletions tt_metal/impl/allocator/allocator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class BankManager {

void dump_blocks(std::ofstream& out) const;

std::vector<std::unordered_map<std::string, std::string>> get_block_table() const;

void shrink_size(DeviceAddr shrink_size, bool bottom_up = true);
void reset_size();

Expand Down Expand Up @@ -122,6 +124,9 @@ Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_t

void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_type, std::ofstream& out);

std::vector<std::unordered_map<std::string, std::string>> get_block_table(
const Allocator& allocator, const BufferType& buffer_type);

std::optional<DeviceAddr> lowest_occupied_l1_address(const Allocator& allocator, uint32_t bank_id);

DeviceAddr base_alloc(
Expand Down
5 changes: 5 additions & 0 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1426,6 +1426,11 @@ void Device::dump_memory_blocks(const BufferType &buffer_type, std::ofstream &ou
return allocator::dump_memory_blocks(*allocator, buffer_type, out);
}

std::vector<std::unordered_map<std::string, std::string>> Device::get_block_table(const BufferType &buffer_type) const {
const auto& allocator = this->get_initialized_allocator();
return allocator::get_block_table(*allocator, buffer_type);
}

const std::unordered_set<Buffer *> &Device::get_allocated_buffers() const {
const auto& allocator = this->get_initialized_allocator();
return allocator::get_allocated_buffers(*allocator);
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ class Device {
void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const;
void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const;

std::vector<std::unordered_map<std::string, std::string>> get_block_table(const BufferType &buffer_type) const;

// Set of logical ethernet core coordinates
// core.x represents connectivity to one other chip, i.e. cores with <x> all connect to same chip
// core.y represents different channels along one <x>
Expand Down
47 changes: 47 additions & 0 deletions ttnn/cpp/pybind11/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,25 @@ void py_device_module_types(py::module& m_device) {
py::class_<SubDeviceId>(m_device, "SubDeviceId", "ID of a sub-device.");

py::class_<SubDeviceManagerId>(m_device, "SubDeviceManagerId", "ID of a sub-device manager.");

py::class_<tt::tt_metal::detail::MemoryView>(
m_device, "MemoryView", "Class representing view of the memory (dram or l1)")
.def_readonly("num_banks", &tt::tt_metal::detail::MemoryView::num_banks)
.def_readonly(
"total_allocatable_per_bank_size_bytes",
&tt::tt_metal::detail::MemoryView::total_allocatable_per_bank_size_bytes)
.def_readonly(
"total_allocated_per_bank_size_bytes",
&tt::tt_metal::detail::MemoryView::total_allocated_per_bank_size_bytes)
.def_readonly(
"total_free_per_bank_size_bytes", &tt::tt_metal::detail::MemoryView::total_free_per_bank_size_bytes)
.def_readonly("total_allocatable_size_bytes", &tt::tt_metal::detail::MemoryView::total_allocatable_size_bytes)
.def_readonly("total_allocated_size_bytes", &tt::tt_metal::detail::MemoryView::total_allocated_size_bytes)
.def_readonly("total_free_size_bytes", &tt::tt_metal::detail::MemoryView::total_free_size_bytes)
.def_readonly(
"largest_contiguous_free_block_per_bank_size_bytes",
&tt::tt_metal::detail::MemoryView::largest_contiguous_free_block_per_bank_size_bytes)
.def_readonly("blockTable", &tt::tt_metal::detail::MemoryView::blockTable);
}

void device_module(py::module& m_device) {
Expand Down Expand Up @@ -518,6 +537,34 @@ void device_module(py::module& m_device) {
+------------------+----------------------------------+-----------------------+-------------+----------+
)doc");

m_device.def(
"GetDramMemoryView",
&tt::tt_metal::detail::GetDramMemoryView,
py::arg().noconvert(),
R"doc(
Populates MemoryView for DRAM. Used when storing to disk is not an option.
+------------------+----------------------------------+-----------------------+-------------+----------+
| Argument | Description | Data type | Valid range | Required |
+==================+==================================+=======================+=============+==========+
| device | Device to dump memory state for | ttnn.Device | | Yes |
+------------------+----------------------------------+-----------------------+-------------+----------+
)doc");

m_device.def(
"GetL1MemoryView",
&tt::tt_metal::detail::GetL1MemoryView,
py::arg().noconvert(),
R"doc(
Populates MemoryView for L1. Used when storing to disk is not an option.
+------------------+----------------------------------+-----------------------+-------------+----------+
| Argument | Description | Data type | Valid range | Required |
+==================+==================================+=======================+=============+==========+
| device | Device to dump memory state for | ttnn.Device | | Yes |
+------------------+----------------------------------+-----------------------+-------------+----------+
)doc");

m_device.def(
"synchronize_device",
[](Device* device, const std::optional<uint8_t> cq_id, const std::vector<SubDeviceId>& sub_device_ids) {
Expand Down
2 changes: 2 additions & 0 deletions ttnn/ttnn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,8 @@ def manage_config(name, value):
manage_device,
synchronize_device,
dump_device_memory_state,
get_dram_memory_view,
get_l1_memory_view,
GetPCIeDeviceID,
GetNumPCIeDevices,
GetNumAvailableDevices,
Expand Down
8 changes: 8 additions & 0 deletions ttnn/ttnn/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,14 @@ def dump_device_memory_state(device, prefix=""):
ttnn._ttnn.device.DumpDeviceMemoryState(device, prefix)


def get_dram_memory_view(device):
return ttnn._ttnn.device.GetDramMemoryView(device)


def get_l1_memory_view(device):
return ttnn._ttnn.device.GetL1MemoryView(device)


def is_wormhole_b0(device=None):
if device is not None:
return device.arch() == ttnn._ttnn.device.Arch.WORMHOLE_B0
Expand Down

0 comments on commit eb4efbd

Please sign in to comment.