Skip to content

Commit

Permalink
#16367: Added support to enable dram and l1 memory collection without…
Browse files Browse the repository at this point in the history
… saving to disk
  • Loading branch information
tapspatel committed Jan 9, 2025
1 parent 21394d0 commit affbc66
Show file tree
Hide file tree
Showing 17 changed files with 183 additions and 1 deletion.
3 changes: 3 additions & 0 deletions tech_reports/memory/allocator.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ These reports can be enabled from C++ and Python.
// API to dump state of memory for a given device. Optional prefix will be prepended to the report.
DumpDeviceMemoryState(const Device *device, std::string prefix="");

// API to get dram memory view for a given device
GetMemoryView(const IDevice* device, const BufferType& buffer_type);

// APIs to enable/disable memory reports for each Program's compile
EnableMemoryReports();
DisableMemoryReports();
Expand Down
20 changes: 20 additions & 0 deletions tt_metal/detail/reports/memory_reporter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,26 @@ void DumpDeviceMemoryState(const IDevice* device, const std::string& prefix) {
MemoryReporter::inst().dump_memory_usage_state(device, std::move(prefix));
}

MemoryView MemoryReporter::get_memory_view(const IDevice* device, const BufferType& buffer_type) const {
auto stats = device->get_memory_allocation_statistics(buffer_type);
auto num_banks_ = device->num_banks(buffer_type);

return MemoryView{
.num_banks = num_banks_,
.bytes_allocatable_per_bank = stats.total_allocatable_size_bytes,
.bytes_allocated_per_bank = stats.total_allocated_bytes,
.bytes_free_per_bank = stats.total_free_bytes,
.total_bytes_allocatable = stats.total_allocatable_size_bytes * num_banks_,
.total_bytes_allocated = stats.total_allocated_bytes * num_banks_,
.total_bytes_free = stats.total_free_bytes * num_banks_,
.largest_contiguous_bytes_free_per_bank = stats.largest_free_block_bytes,
.block_table = device->get_block_table(buffer_type)};
}

MemoryView GetMemoryView(const IDevice* device, const BufferType& buffer_type) {
return MemoryReporter::inst().get_memory_view(device, buffer_type);
}

bool MemoryReporter::enabled() { return is_enabled_; }

void MemoryReporter::toggle(bool state) { is_enabled_ = state; }
Expand Down
40 changes: 40 additions & 0 deletions tt_metal/detail/reports/memory_reporter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
#include <atomic>
#include <fstream>
#include <string>
#include <unordered_map>
#include <vector>

#include "tt_metal/impl/allocator/allocator.hpp"

namespace tt::tt_metal {
inline namespace v0 {

Expand All @@ -16,6 +21,7 @@ class IDevice;

} // namespace v0
namespace detail {
struct MemoryView;

/**
* Enable generation of reports for memory allocation statistics.
Expand Down Expand Up @@ -60,6 +66,38 @@ void DisableMemoryReports();
* */
void DumpDeviceMemoryState(const IDevice* device, const std::string& prefix = "");

/**
* Populates MemoryView for BufferType [dram, l1, l1 small, trace]. Used when storing to disk is not an option.
*
* num_banks: total number of BufferType banks for given device
* bytes_allocatable_per_bank: total allocatable size per bank of BufferType in bytes
* bytes_allocated_per_bank: currently allocated size per bank of BufferType in bytes
* bytes_free_per_bank: total free size per bank of BufferType in bytes
* total_bytes_allocatable: total allocatable size of BufferType in bytes
* total_bytes_allocated: currently allocated size of BufferType in bytes
* total_bytes_free: total free size of BufferType in bytes
* largest_contiguous_bytes_free_per_bank: largest contiguous free block of BufferType in bytes
* block_table: list of all blocks in BufferType (blockID, address, size, prevID, nextID, allocated)
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------|---------------------------------------------------|-----------------|--------------------------------------------------------|----------|
* | device | The device for which memory stats will be dumped. | const IDevice * | | True |
* | buffer_type | The type of buffer to populate the memory view. | const BufferType& | | True |
* */
MemoryView GetMemoryView(const IDevice* device, const BufferType& buffer_type);

struct MemoryView {
std::uint64_t num_banks;
size_t bytes_allocatable_per_bank;
size_t bytes_allocated_per_bank;
size_t bytes_free_per_bank;
size_t total_bytes_allocatable; // bytes_allocatable_per_bank * num_banks
size_t total_bytes_allocated; // bytes_allocated_per_bank * num_banks
size_t total_bytes_free; // bytes_free_per_bank * num_banks
size_t largest_contiguous_bytes_free_per_bank;
std::vector<std::unordered_map<std::string, std::string>> block_table;
};

class MemoryReporter {
public:
MemoryReporter& operator=(const MemoryReporter&) = delete;
Expand All @@ -71,6 +109,8 @@ class MemoryReporter {

void dump_memory_usage_state(const IDevice* device, const std::string& prefix = "") const;

MemoryView get_memory_view(const IDevice* device, const BufferType& buffer_type) const;

static void toggle(bool state);
static MemoryReporter& inst();
static bool enabled();
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ class Algorithm {

virtual void dump_blocks(std::ostream& out) const = 0;

virtual std::vector<std::unordered_map<std::string, std::string>> get_block_table() const = 0;

virtual void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) = 0;

virtual void reset_size() = 0;
Expand Down
18 changes: 18 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,24 @@ void FreeList::dump_blocks(std::ostream& out) const {
out << "\n";
}

std::vector<std::unordered_map<std::string, std::string>> FreeList::get_block_table() const {
std::vector<std::unordered_map<std::string, std::string>> blocks;
boost::local_shared_ptr<Block> curr_block = this->block_head_;

while (curr_block != nullptr) {
std::unordered_map<std::string, std::string> block_entry;

block_entry["address"] = std::to_string(curr_block->address + this->offset_bytes_); // bytes
block_entry["size"] = std::to_string(curr_block->size); // bytes
block_entry["allocated"] = this->is_allocated(curr_block) ? "yes" : "no";

blocks.push_back(block_entry);
curr_block = curr_block->next_block;
}

return blocks;
}

void FreeList::shrink_size(DeviceAddr shrink_size, bool bottom_up) {
if (shrink_size == 0) {
return;
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ class FreeList : public Algorithm {

void dump_blocks(std::ostream& out) const;

std::vector<std::unordered_map<std::string, std::string>> get_block_table() const;

void shrink_size(DeviceAddr shrink_size, bool bottom_up = true);

void reset_size();
Expand Down
22 changes: 22 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list_opt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,28 @@ void FreeListOpt::dump_blocks(std::ostream& out) const {
}
}

std::vector<std::unordered_map<std::string, std::string>> FreeListOpt::get_block_table() const {
std::vector<std::unordered_map<std::string, std::string>> blocks;

for (size_t i = 0; i < block_address_.size(); i++) {
std::unordered_map<std::string, std::string> blockEntry;

if (!meta_block_is_allocated_[i]) {
continue;
}

blockEntry["blockID"] = std::to_string(i);
blockEntry["address"] = std::to_string(block_address_[i]); // bytes
blockEntry["size"] = std::to_string(block_size_[i]); // bytes
blockEntry["prevID"] = std::to_string(block_prev_block_[i]);
blockEntry["nextID"] = std::to_string(block_next_block_[i]);
blockEntry["allocated"] = block_is_allocated_[i] ? "yes" : "no";
blocks.push_back(blockEntry);
}

return blocks;
}

void FreeListOpt::shrink_size(DeviceAddr shrink_size, bool bottom_up) {
if (shrink_size == 0) {
return;
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list_opt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ class FreeListOpt : public Algorithm {

void dump_blocks(std::ostream& out) const override;

std::vector<std::unordered_map<std::string, std::string>> get_block_table() const override;

void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) override;

void reset_size() override;
Expand Down
22 changes: 22 additions & 0 deletions tt_metal/impl/allocator/allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,15 @@ void BankManager::dump_blocks(std::ofstream& out) const {
}
}

std::vector<std::unordered_map<std::string, std::string>> BankManager::get_block_table() const {
if (this->allocator_) {
return this->allocator_->get_block_table();
}

log_warning("allocator is not initialized, cannot get block table for memory");
return {};
}

void BankManager::shrink_size(DeviceAddr shrink_size, bool bottom_up) {
if (this->allocator_) {
this->allocator_->shrink_size(shrink_size, bottom_up);
Expand Down Expand Up @@ -375,6 +384,19 @@ void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_typ
}
}

std::vector<std::unordered_map<std::string, std::string>> get_block_table(
const Allocator& allocator, const BufferType& buffer_type) {
switch (buffer_type) {
case BufferType::DRAM: return allocator.dram_manager.get_block_table();
case BufferType::L1: return allocator.l1_manager.get_block_table();
case BufferType::L1_SMALL: return allocator.l1_small_manager.get_block_table();
case BufferType::TRACE: return allocator.trace_buffer_manager.get_block_table();
default: {
TT_THROW("Unsupported buffer type!");
}
}
}

std::optional<DeviceAddr> lowest_occupied_l1_address(const Allocator& allocator, uint32_t bank_id) {
// l1_manager always sits below l1_small_manager in the address space, so there is no need to check l1_small_manager
return allocator.l1_manager.lowest_occupied_address(bank_id);
Expand Down
5 changes: 5 additions & 0 deletions tt_metal/impl/allocator/allocator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class BankManager {

void dump_blocks(std::ofstream& out) const;

std::vector<std::unordered_map<std::string, std::string>> get_block_table() const;

void shrink_size(DeviceAddr shrink_size, bool bottom_up = true);
void reset_size();

Expand Down Expand Up @@ -122,6 +124,9 @@ Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_t

void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_type, std::ofstream& out);

std::vector<std::unordered_map<std::string, std::string>> get_block_table(
const Allocator& allocator, const BufferType& buffer_type);

std::optional<DeviceAddr> lowest_occupied_l1_address(const Allocator& allocator, uint32_t bank_id);

DeviceAddr base_alloc(
Expand Down
5 changes: 5 additions & 0 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1433,6 +1433,11 @@ void Device::dump_memory_blocks(const BufferType &buffer_type, std::ofstream &ou
return allocator::dump_memory_blocks(*allocator, buffer_type, out);
}

std::vector<std::unordered_map<std::string, std::string>> Device::get_block_table(const BufferType &buffer_type) const {
const auto& allocator = this->get_initialized_allocator();
return allocator::get_block_table(*allocator, buffer_type);
}

const std::unordered_set<Buffer *> &Device::get_allocated_buffers() const {
const auto& allocator = this->get_initialized_allocator();
return allocator::get_allocated_buffers(*allocator);
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ class Device : public IDevice {
void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const override;
void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const override;

std::vector<std::unordered_map<std::string, std::string>> get_block_table(const BufferType &buffer_type) const override;

// Set of logical ethernet core coordinates
// core.x represents connectivity to one other chip, i.e. cores with <x> all connect to same chip
// core.y represents different channels along one <x>
Expand Down
3 changes: 3 additions & 0 deletions tt_metal/include/tt_metal/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ class IDevice {
virtual void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const = 0;
virtual void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const = 0;

virtual std::vector<std::unordered_map<std::string, std::string>> get_block_table(
const BufferType& buffer_type) const = 0;

// Set of logical ethernet core coordinates
// core.x represents connectivity to one other chip, i.e. cores with <x> all connect to same chip
// core.y represents different channels along one <x>
Expand Down
30 changes: 30 additions & 0 deletions ttnn/cpp/pybind11/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,20 @@ void py_device_module_types(py::module& m_device) {
py::class_<SubDeviceId>(m_device, "SubDeviceId", "ID of a sub-device.");

py::class_<SubDeviceManagerId>(m_device, "SubDeviceManagerId", "ID of a sub-device manager.");

py::class_<tt::tt_metal::detail::MemoryView>(
m_device, "MemoryView", "Class representing view of the memory (dram, l1, l1_small, trace) of a device.")
.def_readonly("num_banks", &tt::tt_metal::detail::MemoryView::num_banks)
.def_readonly("bytes_allocatable_per_bank", &tt::tt_metal::detail::MemoryView::bytes_allocatable_per_bank)
.def_readonly("bytes_allocated_per_bank", &tt::tt_metal::detail::MemoryView::bytes_allocated_per_bank)
.def_readonly("bytes_free_per_bank", &tt::tt_metal::detail::MemoryView::bytes_free_per_bank)
.def_readonly("total_bytes_allocatable", &tt::tt_metal::detail::MemoryView::total_bytes_allocatable)
.def_readonly("total_bytes_allocated", &tt::tt_metal::detail::MemoryView::total_bytes_allocated)
.def_readonly("total_bytes_free", &tt::tt_metal::detail::MemoryView::total_bytes_free)
.def_readonly(
"largest_contiguous_bytes_free_per_bank",
&tt::tt_metal::detail::MemoryView::largest_contiguous_bytes_free_per_bank)
.def_readonly("block_table", &tt::tt_metal::detail::MemoryView::block_table);
}

void device_module(py::module& m_device) {
Expand Down Expand Up @@ -537,6 +551,22 @@ void device_module(py::module& m_device) {
+------------------+----------------------------------+-----------------------+-------------+----------+
)doc");

m_device.def(
"GetMemoryView",
&tt::tt_metal::detail::GetMemoryView,
py::arg().noconvert(),
py::arg().noconvert(),
R"doc(
Populates MemoryView for BufferType [dram, l1, l1 small, trace]. Used when storing to disk is not an option.
+------------------+----------------------------------+-----------------------+-------------+----------+
| Argument | Description | Data type | Valid range | Required |
+==================+==================================+=======================+=============+==========+
| device | Device to dump memory state for | ttnn.Device | | Yes |
| buffer_type | Type of buffer for memory view | ttnn.BufferType | | Yes |
+------------------+----------------------------------+-----------------------+-------------+----------+
)doc");

m_device.def(
"synchronize_device",
[](IDevice* device, const std::optional<uint8_t> cq_id, const std::vector<SubDeviceId>& sub_device_ids) {
Expand Down
3 changes: 2 additions & 1 deletion ttnn/cpp/pybind11/tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ void tensor_mem_config_module_types(py::module& m_tensor) {
py::enum_<tt::tt_metal::BufferType>(m_tensor, "BufferType")
.value("DRAM", BufferType::DRAM)
.value("L1", BufferType::L1)
.value("L1_SMALL", BufferType::L1_SMALL);
.value("L1_SMALL", BufferType::L1_SMALL)
.value("TRACE", BufferType::TRACE);

tt_serializable_class<tt::tt_metal::CoreCoord>(m_tensor, "CoreCoord", R"doc(
Class defining core coordinate
Expand Down
1 change: 1 addition & 0 deletions ttnn/ttnn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ def manage_config(name, value):
manage_device,
synchronize_device,
dump_device_memory_state,
get_memory_view,
GetPCIeDeviceID,
GetNumPCIeDevices,
GetNumAvailableDevices,
Expand Down
4 changes: 4 additions & 0 deletions ttnn/ttnn/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ def dump_device_memory_state(device, prefix=""):
ttnn._ttnn.device.DumpDeviceMemoryState(device, prefix)


def get_memory_view(device, buffer_type):
return ttnn._ttnn.device.GetMemoryView(device, buffer_type)


def is_wormhole_b0(device=None):
if device is not None:
return device.arch() == ttnn._ttnn.device.Arch.WORMHOLE_B0
Expand Down

0 comments on commit affbc66

Please sign in to comment.