From 7bb435c0d3080ac0bc9ee64b047c763a9c353cda Mon Sep 17 00:00:00 2001 From: Tapasvi Patel Date: Mon, 30 Dec 2024 22:39:46 +0000 Subject: [PATCH] #16367: Added support to enable dram and l1 memory collection without saving to disk --- tech_reports/memory/allocator.md | 3 ++ tt_metal/detail/reports/memory_reporter.cpp | 20 ++++++++++ tt_metal/detail/reports/memory_reporter.hpp | 40 +++++++++++++++++++ .../algorithms/allocator_algorithm.hpp | 2 + .../impl/allocator/algorithms/free_list.cpp | 18 +++++++++ .../impl/allocator/algorithms/free_list.hpp | 2 + .../allocator/algorithms/free_list_opt.cpp | 22 ++++++++++ .../allocator/algorithms/free_list_opt.hpp | 2 + tt_metal/impl/allocator/allocator.cpp | 22 ++++++++++ tt_metal/impl/allocator/allocator.hpp | 5 +++ tt_metal/impl/device/device.cpp | 5 +++ tt_metal/impl/device/device.hpp | 2 + tt_metal/include/tt_metal/device.hpp | 3 ++ ttnn/cpp/pybind11/device.cpp | 30 ++++++++++++++ ttnn/cpp/pybind11/tensor.cpp | 3 +- ttnn/ttnn/__init__.py | 1 + ttnn/ttnn/device.py | 4 ++ 17 files changed, 183 insertions(+), 1 deletion(-) diff --git a/tech_reports/memory/allocator.md b/tech_reports/memory/allocator.md index 135baf4b3f9a..b1651bc0a129 100644 --- a/tech_reports/memory/allocator.md +++ b/tech_reports/memory/allocator.md @@ -75,6 +75,9 @@ These reports can be enabled from C++ and Python. // API to dump state of memory for a given device. Optional prefix will be prepended to the report. DumpDeviceMemoryState(const Device *device, std::string prefix=""); +// API to get dram memory view for a given device +GetMemoryView(const IDevice* device, const BufferType& buffer_type); + // APIs to enable/disable memory reports for each Program's compile EnableMemoryReports(); DisableMemoryReports(); diff --git a/tt_metal/detail/reports/memory_reporter.cpp b/tt_metal/detail/reports/memory_reporter.cpp index ce948932235c..715853d75cd3 100644 --- a/tt_metal/detail/reports/memory_reporter.cpp +++ b/tt_metal/detail/reports/memory_reporter.cpp @@ -148,6 +148,26 @@ void DumpDeviceMemoryState(const IDevice* device, const std::string& prefix) { MemoryReporter::inst().dump_memory_usage_state(device, std::move(prefix)); } +MemoryView MemoryReporter::get_memory_view(const IDevice* device, const BufferType& buffer_type) const { + auto stats = device->get_memory_allocation_statistics(buffer_type); + auto num_banks_ = device->num_banks(buffer_type); + + return MemoryView{ + .num_banks = num_banks_, + .bytes_allocatable_per_bank = stats.total_allocatable_size_bytes, + .bytes_allocated_per_bank = stats.total_allocated_bytes, + .bytes_free_per_bank = stats.total_free_bytes, + .total_bytes_allocatable = stats.total_allocatable_size_bytes * num_banks_, + .total_bytes_allocated = stats.total_allocated_bytes * num_banks_, + .total_bytes_free = stats.total_free_bytes * num_banks_, + .largest_contiguous_bytes_free_per_bank = stats.largest_free_block_bytes, + .block_table = device->get_block_table(buffer_type)}; +} + +MemoryView GetMemoryView(const IDevice* device, const BufferType& buffer_type) { + return MemoryReporter::inst().get_memory_view(device, buffer_type); +} + bool MemoryReporter::enabled() { return is_enabled_; } void MemoryReporter::toggle(bool state) { is_enabled_ = state; } diff --git a/tt_metal/detail/reports/memory_reporter.hpp b/tt_metal/detail/reports/memory_reporter.hpp index ebcdd97fdfe8..30db9d8c14a0 100644 --- a/tt_metal/detail/reports/memory_reporter.hpp +++ b/tt_metal/detail/reports/memory_reporter.hpp @@ -8,6 +8,11 @@ #include #include #include +#include +#include + +#include "tt_metal/impl/allocator/allocator.hpp" + namespace tt::tt_metal { inline namespace v0 { @@ -16,6 +21,7 @@ class IDevice; } // namespace v0 namespace detail { +struct MemoryView; /** * Enable generation of reports for memory allocation statistics. @@ -60,6 +66,38 @@ void DisableMemoryReports(); * */ void DumpDeviceMemoryState(const IDevice* device, const std::string& prefix = ""); +/** + * Populates MemoryView for BufferType [dram, l1, l1 small, trace]. Used when storing to disk is not an option. + * + * num_banks: total number of BufferType banks for given device + * bytes_allocatable_per_bank: total allocatable size per bank of BufferType in bytes + * bytes_allocated_per_bank: currently allocated size per bank of BufferType in bytes + * bytes_free_per_bank: total free size per bank of BufferType in bytes + * total_bytes_allocatable: total allocatable size of BufferType in bytes + * total_bytes_allocated: currently allocated size of BufferType in bytes + * total_bytes_free: total free size of BufferType in bytes + * largest_contiguous_bytes_free_per_bank: largest contiguous free block of BufferType in bytes + * block_table: list of all blocks in BufferType (blockID, address, size, prevID, nextID, allocated) + * + * | Argument | Description | Type | Valid Range | Required | + * |---------------|---------------------------------------------------|-----------------|--------------------------------------------------------|----------| + * | device | The device for which memory stats will be dumped. | const IDevice * | | True | + * | buffer_type | The type of buffer to populate the memory view. | const BufferType& | | True | + * */ +MemoryView GetMemoryView(const IDevice* device, const BufferType& buffer_type); + +struct MemoryView { + std::uint64_t num_banks; + size_t bytes_allocatable_per_bank; + size_t bytes_allocated_per_bank; + size_t bytes_free_per_bank; + size_t total_bytes_allocatable; // bytes_allocatable_per_bank * num_banks + size_t total_bytes_allocated; // bytes_allocated_per_bank * num_banks + size_t total_bytes_free; // bytes_free_per_bank * num_banks + size_t largest_contiguous_bytes_free_per_bank; + std::vector> block_table; +}; + class MemoryReporter { public: MemoryReporter& operator=(const MemoryReporter&) = delete; @@ -71,6 +109,8 @@ class MemoryReporter { void dump_memory_usage_state(const IDevice* device, const std::string& prefix = "") const; + MemoryView get_memory_view(const IDevice* device, const BufferType& buffer_type) const; + static void toggle(bool state); static MemoryReporter& inst(); static bool enabled(); diff --git a/tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp b/tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp index 6eec0150db8d..189f307faf0b 100644 --- a/tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp +++ b/tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp @@ -65,6 +65,8 @@ class Algorithm { virtual void dump_blocks(std::ostream& out) const = 0; + virtual std::vector> get_block_table() const = 0; + virtual void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) = 0; virtual void reset_size() = 0; diff --git a/tt_metal/impl/allocator/algorithms/free_list.cpp b/tt_metal/impl/allocator/algorithms/free_list.cpp index adc75c102e09..bf49018d3f46 100644 --- a/tt_metal/impl/allocator/algorithms/free_list.cpp +++ b/tt_metal/impl/allocator/algorithms/free_list.cpp @@ -446,6 +446,24 @@ void FreeList::dump_blocks(std::ostream& out) const { out << "\n"; } +std::vector> FreeList::get_block_table() const { + std::vector> blocks; + boost::local_shared_ptr curr_block = this->block_head_; + + while (curr_block != nullptr) { + std::unordered_map block_entry; + + block_entry["address"] = std::to_string(curr_block->address + this->offset_bytes_); // bytes + block_entry["size"] = std::to_string(curr_block->size); // bytes + block_entry["allocated"] = this->is_allocated(curr_block) ? "yes" : "no"; + + blocks.push_back(block_entry); + curr_block = curr_block->next_block; + } + + return blocks; +} + void FreeList::shrink_size(DeviceAddr shrink_size, bool bottom_up) { if (shrink_size == 0) { return; diff --git a/tt_metal/impl/allocator/algorithms/free_list.hpp b/tt_metal/impl/allocator/algorithms/free_list.hpp index 467fc8c3ecea..bf79ccd0d0b2 100644 --- a/tt_metal/impl/allocator/algorithms/free_list.hpp +++ b/tt_metal/impl/allocator/algorithms/free_list.hpp @@ -38,6 +38,8 @@ class FreeList : public Algorithm { void dump_blocks(std::ostream& out) const; + std::vector> get_block_table() const; + void shrink_size(DeviceAddr shrink_size, bool bottom_up = true); void reset_size(); diff --git a/tt_metal/impl/allocator/algorithms/free_list_opt.cpp b/tt_metal/impl/allocator/algorithms/free_list_opt.cpp index bd24c0c871b8..b80ed06a531e 100644 --- a/tt_metal/impl/allocator/algorithms/free_list_opt.cpp +++ b/tt_metal/impl/allocator/algorithms/free_list_opt.cpp @@ -466,6 +466,28 @@ void FreeListOpt::dump_blocks(std::ostream& out) const { } } +std::vector> FreeListOpt::get_block_table() const { + std::vector> blocks; + + for (size_t i = 0; i < block_address_.size(); i++) { + std::unordered_map blockEntry; + + if (!meta_block_is_allocated_[i]) { + continue; + } + + blockEntry["blockID"] = std::to_string(i); + blockEntry["address"] = std::to_string(block_address_[i]); // bytes + blockEntry["size"] = std::to_string(block_size_[i]); // bytes + blockEntry["prevID"] = std::to_string(block_prev_block_[i]); + blockEntry["nextID"] = std::to_string(block_next_block_[i]); + blockEntry["allocated"] = block_is_allocated_[i] ? "yes" : "no"; + blocks.push_back(blockEntry); + } + + return blocks; +} + void FreeListOpt::shrink_size(DeviceAddr shrink_size, bool bottom_up) { if (shrink_size == 0) { return; diff --git a/tt_metal/impl/allocator/algorithms/free_list_opt.hpp b/tt_metal/impl/allocator/algorithms/free_list_opt.hpp index 1a3a7f87405b..8827efbec5c6 100644 --- a/tt_metal/impl/allocator/algorithms/free_list_opt.hpp +++ b/tt_metal/impl/allocator/algorithms/free_list_opt.hpp @@ -49,6 +49,8 @@ class FreeListOpt : public Algorithm { void dump_blocks(std::ostream& out) const override; + std::vector> get_block_table() const override; + void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) override; void reset_size() override; diff --git a/tt_metal/impl/allocator/allocator.cpp b/tt_metal/impl/allocator/allocator.cpp index 059c209d7a0c..5bcf20110357 100644 --- a/tt_metal/impl/allocator/allocator.cpp +++ b/tt_metal/impl/allocator/allocator.cpp @@ -201,6 +201,15 @@ void BankManager::dump_blocks(std::ofstream& out) const { } } +std::vector> BankManager::get_block_table() const { + if (this->allocator_) { + return this->allocator_->get_block_table(); + } + + log_warning("allocator is not initialized, cannot get block table for memory"); + return {}; +} + void BankManager::shrink_size(DeviceAddr shrink_size, bool bottom_up) { if (this->allocator_) { this->allocator_->shrink_size(shrink_size, bottom_up); @@ -375,6 +384,19 @@ void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_typ } } +std::vector> get_block_table( + const Allocator& allocator, const BufferType& buffer_type) { + switch (buffer_type) { + case BufferType::DRAM: return allocator.dram_manager.get_block_table(); + case BufferType::L1: return allocator.l1_manager.get_block_table(); + case BufferType::L1_SMALL: return allocator.l1_small_manager.get_block_table(); + case BufferType::TRACE: return allocator.trace_buffer_manager.get_block_table(); + default: { + TT_THROW("Unsupported buffer type!"); + } + } +} + std::optional lowest_occupied_l1_address(const Allocator& allocator, uint32_t bank_id) { // l1_manager always sits below l1_small_manager in the address space, so there is no need to check l1_small_manager return allocator.l1_manager.lowest_occupied_address(bank_id); diff --git a/tt_metal/impl/allocator/allocator.hpp b/tt_metal/impl/allocator/allocator.hpp index a3eb7f05ed06..30e7807cf3be 100644 --- a/tt_metal/impl/allocator/allocator.hpp +++ b/tt_metal/impl/allocator/allocator.hpp @@ -76,6 +76,8 @@ class BankManager { void dump_blocks(std::ofstream& out) const; + std::vector> get_block_table() const; + void shrink_size(DeviceAddr shrink_size, bool bottom_up = true); void reset_size(); @@ -122,6 +124,9 @@ Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_t void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_type, std::ofstream& out); +std::vector> get_block_table( + const Allocator& allocator, const BufferType& buffer_type); + std::optional lowest_occupied_l1_address(const Allocator& allocator, uint32_t bank_id); DeviceAddr base_alloc( diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index cff629ac78e8..6fe7068b85ca 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -1433,6 +1433,11 @@ void Device::dump_memory_blocks(const BufferType &buffer_type, std::ofstream &ou return allocator::dump_memory_blocks(*allocator, buffer_type, out); } +std::vector> Device::get_block_table(const BufferType &buffer_type) const { + const auto& allocator = this->get_initialized_allocator(); + return allocator::get_block_table(*allocator, buffer_type); +} + const std::unordered_set &Device::get_allocated_buffers() const { const auto& allocator = this->get_initialized_allocator(); return allocator::get_allocated_buffers(*allocator); diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 88bfdeb4b917..1a8913791bee 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -147,6 +147,8 @@ class Device : public IDevice { void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const override; void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const override; + std::vector> get_block_table(const BufferType &buffer_type) const override; + // Set of logical ethernet core coordinates // core.x represents connectivity to one other chip, i.e. cores with all connect to same chip // core.y represents different channels along one diff --git a/tt_metal/include/tt_metal/device.hpp b/tt_metal/include/tt_metal/device.hpp index 3cb6a375706e..f7c815f77949 100644 --- a/tt_metal/include/tt_metal/device.hpp +++ b/tt_metal/include/tt_metal/device.hpp @@ -162,6 +162,9 @@ class IDevice { virtual void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const = 0; virtual void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const = 0; + virtual std::vector> get_block_table( + const BufferType& buffer_type) const = 0; + // Set of logical ethernet core coordinates // core.x represents connectivity to one other chip, i.e. cores with all connect to same chip // core.y represents different channels along one diff --git a/ttnn/cpp/pybind11/device.cpp b/ttnn/cpp/pybind11/device.cpp index 0c8f118dd04b..1b114be76dbd 100644 --- a/ttnn/cpp/pybind11/device.cpp +++ b/ttnn/cpp/pybind11/device.cpp @@ -108,6 +108,20 @@ void py_device_module_types(py::module& m_device) { py::class_(m_device, "SubDeviceId", "ID of a sub-device."); py::class_(m_device, "SubDeviceManagerId", "ID of a sub-device manager."); + + py::class_( + m_device, "MemoryView", "Class representing view of the memory (dram, l1, l1_small, trace) of a device.") + .def_readonly("num_banks", &tt::tt_metal::detail::MemoryView::num_banks) + .def_readonly("bytes_allocatable_per_bank", &tt::tt_metal::detail::MemoryView::bytes_allocatable_per_bank) + .def_readonly("bytes_allocated_per_bank", &tt::tt_metal::detail::MemoryView::bytes_allocated_per_bank) + .def_readonly("bytes_free_per_bank", &tt::tt_metal::detail::MemoryView::bytes_free_per_bank) + .def_readonly("total_bytes_allocatable", &tt::tt_metal::detail::MemoryView::total_bytes_allocatable) + .def_readonly("total_bytes_allocated", &tt::tt_metal::detail::MemoryView::total_bytes_allocated) + .def_readonly("total_bytes_free", &tt::tt_metal::detail::MemoryView::total_bytes_free) + .def_readonly( + "largest_contiguous_bytes_free_per_bank", + &tt::tt_metal::detail::MemoryView::largest_contiguous_bytes_free_per_bank) + .def_readonly("block_table", &tt::tt_metal::detail::MemoryView::block_table); } void device_module(py::module& m_device) { @@ -537,6 +551,22 @@ void device_module(py::module& m_device) { +------------------+----------------------------------+-----------------------+-------------+----------+ )doc"); + m_device.def( + "GetMemoryView", + &tt::tt_metal::detail::GetMemoryView, + py::arg().noconvert(), + py::arg().noconvert(), + R"doc( + Populates MemoryView for BufferType [dram, l1, l1 small, trace]. Used when storing to disk is not an option. + + +------------------+----------------------------------+-----------------------+-------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +==================+==================================+=======================+=============+==========+ + | device | Device to dump memory state for | ttnn.Device | | Yes | + | buffer_type | Type of buffer for memory view | ttnn.BufferType | | Yes | + +------------------+----------------------------------+-----------------------+-------------+----------+ + )doc"); + m_device.def( "synchronize_device", [](IDevice* device, const std::optional cq_id, const std::vector& sub_device_ids) { diff --git a/ttnn/cpp/pybind11/tensor.cpp b/ttnn/cpp/pybind11/tensor.cpp index df04324818d9..43d207367837 100644 --- a/ttnn/cpp/pybind11/tensor.cpp +++ b/ttnn/cpp/pybind11/tensor.cpp @@ -80,7 +80,8 @@ void tensor_mem_config_module_types(py::module& m_tensor) { py::enum_(m_tensor, "BufferType") .value("DRAM", BufferType::DRAM) .value("L1", BufferType::L1) - .value("L1_SMALL", BufferType::L1_SMALL); + .value("L1_SMALL", BufferType::L1_SMALL) + .value("TRACE", BufferType::TRACE); tt_serializable_class(m_tensor, "CoreCoord", R"doc( Class defining core coordinate diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index 5c9815652577..91e55d35c29e 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -174,6 +174,7 @@ def manage_config(name, value): manage_device, synchronize_device, dump_device_memory_state, + get_memory_view, GetPCIeDeviceID, GetNumPCIeDevices, GetNumAvailableDevices, diff --git a/ttnn/ttnn/device.py b/ttnn/ttnn/device.py index 6cbfaa85ead7..09aa5a22929a 100644 --- a/ttnn/ttnn/device.py +++ b/ttnn/ttnn/device.py @@ -134,6 +134,10 @@ def dump_device_memory_state(device, prefix=""): ttnn._ttnn.device.DumpDeviceMemoryState(device, prefix) +def get_memory_view(device, buffer_type): + return ttnn._ttnn.device.GetMemoryView(device, buffer_type) + + def is_wormhole_b0(device=None): if device is not None: return device.arch() == ttnn._ttnn.device.Arch.WORMHOLE_B0