diff --git a/tech_reports/memory/allocator.md b/tech_reports/memory/allocator.md index 135baf4b3f9a..45048c44401b 100644 --- a/tech_reports/memory/allocator.md +++ b/tech_reports/memory/allocator.md @@ -75,6 +75,12 @@ These reports can be enabled from C++ and Python. // API to dump state of memory for a given device. Optional prefix will be prepended to the report. DumpDeviceMemoryState(const Device *device, std::string prefix=""); +// API to get dram memory view for a given device +GetDramMemoryView(const Device *device); + +// API to get l1 memory view for a given device +GetL1MemoryView(const Device *device); + // APIs to enable/disable memory reports for each Program's compile EnableMemoryReports(); DisableMemoryReports(); diff --git a/tt_metal/detail/reports/memory_reporter.cpp b/tt_metal/detail/reports/memory_reporter.cpp index abb5936c6701..403daba4ad76 100644 --- a/tt_metal/detail/reports/memory_reporter.cpp +++ b/tt_metal/detail/reports/memory_reporter.cpp @@ -144,10 +144,39 @@ void MemoryReporter::init_reports() { write_headers( this->program_memory_usage_summary_report_, this->program_l1_usage_summary_report_, /*add_program_id=*/true); } + void DumpDeviceMemoryState(const Device* device, const std::string& prefix) { MemoryReporter::inst().dump_memory_usage_state(device, std::move(prefix)); } +MemoryView get_memory_view(const Device* device, const BufferType& buffer_type) { + auto stats = device->get_memory_allocation_statistics(buffer_type); + auto num_banks_ = device->num_banks(buffer_type); + + return MemoryView{ + .num_banks = num_banks_, + .bytes_allocatable_per_bank = stats.total_allocatable_size_bytes, + .bytes_allocated_per_bank = stats.total_allocated_bytes, + .bytes_free_per_bank = stats.total_free_bytes, + .total_bytes_allocatable = stats.total_allocatable_size_bytes * num_banks_, + .total_bytes_allocated = stats.total_allocated_bytes * num_banks_, + .total_bytes_free = stats.total_free_bytes * num_banks_, + .largest_contiguous_bytes_free_per_bank = stats.largest_free_block_bytes, + .blockTable = device->get_block_table(buffer_type)}; +} + +MemoryView MemoryReporter::get_dram_memory_view(const Device* device) const { + return get_memory_view(device, BufferType::DRAM); +} + +MemoryView MemoryReporter::get_l1_memory_view(const Device* device) const { + return get_memory_view(device, BufferType::L1); +} + +MemoryView GetDramMemoryView(const Device* device) { return MemoryReporter::inst().get_dram_memory_view(device); } + +MemoryView GetL1MemoryView(const Device* device) { return MemoryReporter::inst().get_l1_memory_view(device); } + bool MemoryReporter::enabled() { return is_enabled_; } void MemoryReporter::toggle(bool state) { is_enabled_ = state; } diff --git a/tt_metal/detail/reports/memory_reporter.hpp b/tt_metal/detail/reports/memory_reporter.hpp index 10ae811dad7b..7834b37a59ba 100644 --- a/tt_metal/detail/reports/memory_reporter.hpp +++ b/tt_metal/detail/reports/memory_reporter.hpp @@ -8,6 +8,9 @@ #include #include #include +#include +#include + namespace tt::tt_metal { inline namespace v0 { @@ -16,6 +19,7 @@ class Device; } // namespace v0 namespace detail { +struct MemoryView; /** * Enable generation of reports for memory allocation statistics. @@ -60,6 +64,59 @@ void DisableMemoryReports(); * */ void DumpDeviceMemoryState(const Device* device, const std::string& prefix = ""); +/** + * Populates MemoryView for DRAM. Used when storing to disk is not an option. + * + * num_banks: total number of dram banks for given device + * bytes_allocatable_per_bank: total allocatable size per bank of dram in bytes + * bytes_allocated_per_bank: currently allocated size per bank of dram in bytes + * bytes_free_per_bank: total free size per bank of dram in bytes + * total_bytes_allocatable: total allocatable size of dram in bytes + * total_bytes_allocated: currently allocated size of dram in bytes + * total_bytes_free: total free size of dram in bytes + * largest_contiguous_bytes_free_per_bank: largest contiguous free block of dram in bytes + * blockTable: list of all blocks in dram (blockID, address, size, prevID, nextID, allocated) + * + * std::vector>: list of all blocks in dram (blockID, address, size, + * prevID, nextID, allocated) + * + * | Argument | Description | Type | Valid Range | Required | + * |---------------|---------------------------------------------------|-----------------|--------------------------------------------------------|----------| + * | device | The device for which memory stats will be dumped. | const Device * | | True | + * */ +MemoryView GetDramMemoryView(const Device* device); + +/** + * Populates MemoryView for L1. Used when storing to disk is not an option. + * + * num_banks: total number of dram banks for given device + * bytes_allocatable_per_bank: total allocatable size per bank of dram in bytes + * bytes_allocated_per_bank: currently allocated size per bank of dram in bytes + * bytes_free_per_bank: total free size per bank of dram in bytes + * total_bytes_allocatable: total allocatable size of dram in bytes + * total_bytes_allocated: currently allocated size of dram in bytes + * total_bytes_free: total free size of dram in bytes + * largest_contiguous_bytes_free_per_bank: largest contiguous free block of dram in bytes + * blockTable: list of all blocks in dram (blockID, address, size, prevID, nextID, allocated) + * + * | Argument | Description | Type | Valid Range | Required | + * |---------------|---------------------------------------------------|-----------------|--------------------------------------------------------|----------| + * | device | The device for which memory stats will be dumped. | const Device * | | True | + * */ +MemoryView GetL1MemoryView(const Device* device); + +struct MemoryView { + std::uint64_t num_banks; + size_t bytes_allocatable_per_bank; + size_t bytes_allocated_per_bank; + size_t bytes_free_per_bank; + size_t total_bytes_allocatable; // bytes_allocatable_per_bank * num_banks + size_t total_bytes_allocated; // bytes_allocated_per_bank * num_banks + size_t total_bytes_free; // bytes_free_per_bank * num_banks + size_t largest_contiguous_bytes_free_per_bank; + std::vector> blockTable; +}; + class MemoryReporter { public: MemoryReporter& operator=(const MemoryReporter&) = delete; @@ -70,6 +127,8 @@ class MemoryReporter { void flush_program_memory_usage(uint64_t program_id, const Device* device); void dump_memory_usage_state(const Device* device, const std::string& prefix = "") const; + MemoryView get_dram_memory_view(const Device* device) const; + MemoryView get_l1_memory_view(const Device* device) const; static void toggle(bool state); static MemoryReporter& inst(); diff --git a/tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp b/tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp index 6eec0150db8d..189f307faf0b 100644 --- a/tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp +++ b/tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp @@ -65,6 +65,8 @@ class Algorithm { virtual void dump_blocks(std::ostream& out) const = 0; + virtual std::vector> get_block_table() const = 0; + virtual void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) = 0; virtual void reset_size() = 0; diff --git a/tt_metal/impl/allocator/algorithms/free_list.cpp b/tt_metal/impl/allocator/algorithms/free_list.cpp index adc75c102e09..a81397e1c509 100644 --- a/tt_metal/impl/allocator/algorithms/free_list.cpp +++ b/tt_metal/impl/allocator/algorithms/free_list.cpp @@ -446,6 +446,24 @@ void FreeList::dump_blocks(std::ostream& out) const { out << "\n"; } +std::vector> FreeList::get_block_table() const { + std::vector> blocks; + boost::local_shared_ptr curr_block = this->block_head_; + + while (curr_block != nullptr) { + std::unordered_map blockEntry; + + blockEntry["address"] = std::to_string(curr_block->address + this->offset_bytes_); /// bytes + blockEntry["size"] = std::to_string(curr_block->size); // bytes + blockEntry["allocated"] = this->is_allocated(curr_block) ? "yes" : "no"; + + blocks.push_back(blockEntry); + curr_block = curr_block->next_block; + } + + return blocks; +} + void FreeList::shrink_size(DeviceAddr shrink_size, bool bottom_up) { if (shrink_size == 0) { return; diff --git a/tt_metal/impl/allocator/algorithms/free_list.hpp b/tt_metal/impl/allocator/algorithms/free_list.hpp index 467fc8c3ecea..bf79ccd0d0b2 100644 --- a/tt_metal/impl/allocator/algorithms/free_list.hpp +++ b/tt_metal/impl/allocator/algorithms/free_list.hpp @@ -38,6 +38,8 @@ class FreeList : public Algorithm { void dump_blocks(std::ostream& out) const; + std::vector> get_block_table() const; + void shrink_size(DeviceAddr shrink_size, bool bottom_up = true); void reset_size(); diff --git a/tt_metal/impl/allocator/algorithms/free_list_opt.cpp b/tt_metal/impl/allocator/algorithms/free_list_opt.cpp index bd24c0c871b8..b80ed06a531e 100644 --- a/tt_metal/impl/allocator/algorithms/free_list_opt.cpp +++ b/tt_metal/impl/allocator/algorithms/free_list_opt.cpp @@ -466,6 +466,28 @@ void FreeListOpt::dump_blocks(std::ostream& out) const { } } +std::vector> FreeListOpt::get_block_table() const { + std::vector> blocks; + + for (size_t i = 0; i < block_address_.size(); i++) { + std::unordered_map blockEntry; + + if (!meta_block_is_allocated_[i]) { + continue; + } + + blockEntry["blockID"] = std::to_string(i); + blockEntry["address"] = std::to_string(block_address_[i]); // bytes + blockEntry["size"] = std::to_string(block_size_[i]); // bytes + blockEntry["prevID"] = std::to_string(block_prev_block_[i]); + blockEntry["nextID"] = std::to_string(block_next_block_[i]); + blockEntry["allocated"] = block_is_allocated_[i] ? "yes" : "no"; + blocks.push_back(blockEntry); + } + + return blocks; +} + void FreeListOpt::shrink_size(DeviceAddr shrink_size, bool bottom_up) { if (shrink_size == 0) { return; diff --git a/tt_metal/impl/allocator/algorithms/free_list_opt.hpp b/tt_metal/impl/allocator/algorithms/free_list_opt.hpp index 1a3a7f87405b..8827efbec5c6 100644 --- a/tt_metal/impl/allocator/algorithms/free_list_opt.hpp +++ b/tt_metal/impl/allocator/algorithms/free_list_opt.hpp @@ -49,6 +49,8 @@ class FreeListOpt : public Algorithm { void dump_blocks(std::ostream& out) const override; + std::vector> get_block_table() const override; + void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) override; void reset_size() override; diff --git a/tt_metal/impl/allocator/allocator.cpp b/tt_metal/impl/allocator/allocator.cpp index 059c209d7a0c..5bcf20110357 100644 --- a/tt_metal/impl/allocator/allocator.cpp +++ b/tt_metal/impl/allocator/allocator.cpp @@ -201,6 +201,15 @@ void BankManager::dump_blocks(std::ofstream& out) const { } } +std::vector> BankManager::get_block_table() const { + if (this->allocator_) { + return this->allocator_->get_block_table(); + } + + log_warning("allocator is not initialized, cannot get block table for memory"); + return {}; +} + void BankManager::shrink_size(DeviceAddr shrink_size, bool bottom_up) { if (this->allocator_) { this->allocator_->shrink_size(shrink_size, bottom_up); @@ -375,6 +384,19 @@ void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_typ } } +std::vector> get_block_table( + const Allocator& allocator, const BufferType& buffer_type) { + switch (buffer_type) { + case BufferType::DRAM: return allocator.dram_manager.get_block_table(); + case BufferType::L1: return allocator.l1_manager.get_block_table(); + case BufferType::L1_SMALL: return allocator.l1_small_manager.get_block_table(); + case BufferType::TRACE: return allocator.trace_buffer_manager.get_block_table(); + default: { + TT_THROW("Unsupported buffer type!"); + } + } +} + std::optional lowest_occupied_l1_address(const Allocator& allocator, uint32_t bank_id) { // l1_manager always sits below l1_small_manager in the address space, so there is no need to check l1_small_manager return allocator.l1_manager.lowest_occupied_address(bank_id); diff --git a/tt_metal/impl/allocator/allocator.hpp b/tt_metal/impl/allocator/allocator.hpp index a3eb7f05ed06..30e7807cf3be 100644 --- a/tt_metal/impl/allocator/allocator.hpp +++ b/tt_metal/impl/allocator/allocator.hpp @@ -76,6 +76,8 @@ class BankManager { void dump_blocks(std::ofstream& out) const; + std::vector> get_block_table() const; + void shrink_size(DeviceAddr shrink_size, bool bottom_up = true); void reset_size(); @@ -122,6 +124,9 @@ Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_t void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_type, std::ofstream& out); +std::vector> get_block_table( + const Allocator& allocator, const BufferType& buffer_type); + std::optional lowest_occupied_l1_address(const Allocator& allocator, uint32_t bank_id); DeviceAddr base_alloc( diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 2c1f685bd9d4..16453d2232f2 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -1426,6 +1426,11 @@ void Device::dump_memory_blocks(const BufferType &buffer_type, std::ofstream &ou return allocator::dump_memory_blocks(*allocator, buffer_type, out); } +std::vector> Device::get_block_table(const BufferType &buffer_type) const { + const auto& allocator = this->get_initialized_allocator(); + return allocator::get_block_table(*allocator, buffer_type); +} + const std::unordered_set &Device::get_allocated_buffers() const { const auto& allocator = this->get_initialized_allocator(); return allocator::get_allocated_buffers(*allocator); diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 8ddabb2d8c94..59c4dd838bf9 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -178,6 +178,8 @@ class Device { void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const; void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const; + std::vector> get_block_table(const BufferType &buffer_type) const; + // Set of logical ethernet core coordinates // core.x represents connectivity to one other chip, i.e. cores with all connect to same chip // core.y represents different channels along one diff --git a/ttnn/cpp/pybind11/device.cpp b/ttnn/cpp/pybind11/device.cpp index cb46a4510e85..c946e072fd4e 100644 --- a/ttnn/cpp/pybind11/device.cpp +++ b/ttnn/cpp/pybind11/device.cpp @@ -104,6 +104,20 @@ void py_device_module_types(py::module& m_device) { py::class_(m_device, "SubDeviceId", "ID of a sub-device."); py::class_(m_device, "SubDeviceManagerId", "ID of a sub-device manager."); + + py::class_( + m_device, "MemoryView", "Class representing view of the memory (dram or l1)") + .def_readonly("num_banks", &tt::tt_metal::detail::MemoryView::num_banks) + .def_readonly("bytes_allocatable_per_bank", &tt::tt_metal::detail::MemoryView::bytes_allocatable_per_bank) + .def_readonly("bytes_allocated_per_bank", &tt::tt_metal::detail::MemoryView::bytes_allocated_per_bank) + .def_readonly("bytes_free_per_bank", &tt::tt_metal::detail::MemoryView::bytes_free_per_bank) + .def_readonly("total_bytes_allocatable", &tt::tt_metal::detail::MemoryView::total_bytes_allocatable) + .def_readonly("total_bytes_allocated", &tt::tt_metal::detail::MemoryView::total_bytes_allocated) + .def_readonly("total_bytes_free", &tt::tt_metal::detail::MemoryView::total_bytes_free) + .def_readonly( + "largest_contiguous_bytes_free_per_bank", + &tt::tt_metal::detail::MemoryView::largest_contiguous_bytes_free_per_bank) + .def_readonly("blockTable", &tt::tt_metal::detail::MemoryView::blockTable); } void device_module(py::module& m_device) { @@ -518,6 +532,34 @@ void device_module(py::module& m_device) { +------------------+----------------------------------+-----------------------+-------------+----------+ )doc"); + m_device.def( + "GetDramMemoryView", + &tt::tt_metal::detail::GetDramMemoryView, + py::arg().noconvert(), + R"doc( + Populates MemoryView for DRAM. Used when storing to disk is not an option. + + +------------------+----------------------------------+-----------------------+-------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +==================+==================================+=======================+=============+==========+ + | device | Device to dump memory state for | ttnn.Device | | Yes | + +------------------+----------------------------------+-----------------------+-------------+----------+ + )doc"); + + m_device.def( + "GetL1MemoryView", + &tt::tt_metal::detail::GetL1MemoryView, + py::arg().noconvert(), + R"doc( + Populates MemoryView for L1. Used when storing to disk is not an option. + + +------------------+----------------------------------+-----------------------+-------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +==================+==================================+=======================+=============+==========+ + | device | Device to dump memory state for | ttnn.Device | | Yes | + +------------------+----------------------------------+-----------------------+-------------+----------+ + )doc"); + m_device.def( "synchronize_device", [](Device* device, const std::optional cq_id, const std::vector& sub_device_ids) { diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index c5aece10e0a2..681569b8b4fd 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -173,6 +173,8 @@ def manage_config(name, value): manage_device, synchronize_device, dump_device_memory_state, + get_dram_memory_view, + get_l1_memory_view, GetPCIeDeviceID, GetNumPCIeDevices, GetNumAvailableDevices, diff --git a/ttnn/ttnn/device.py b/ttnn/ttnn/device.py index 6cbfaa85ead7..ab6c12d44cf8 100644 --- a/ttnn/ttnn/device.py +++ b/ttnn/ttnn/device.py @@ -134,6 +134,14 @@ def dump_device_memory_state(device, prefix=""): ttnn._ttnn.device.DumpDeviceMemoryState(device, prefix) +def get_dram_memory_view(device): + return ttnn._ttnn.device.GetDramMemoryView(device) + + +def get_l1_memory_view(device): + return ttnn._ttnn.device.GetL1MemoryView(device) + + def is_wormhole_b0(device=None): if device is not None: return device.arch() == ttnn._ttnn.device.Arch.WORMHOLE_B0