Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#16367: Added support to enable dram and l1 memory collection without saving to disk #16368

Merged
merged 1 commit into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions tech_reports/memory/allocator.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ These reports can be enabled from C++ and Python.
// API to dump state of memory for a given device. Optional prefix will be prepended to the report.
DumpDeviceMemoryState(const Device *device, std::string prefix="");

// API to get dram memory view for a given device
GetMemoryView(const IDevice* device, const BufferType& buffer_type);

// APIs to enable/disable memory reports for each Program's compile
EnableMemoryReports();
DisableMemoryReports();
Expand Down
17 changes: 17 additions & 0 deletions tt_metal/detail/reports/memory_reporter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,23 @@ void DumpDeviceMemoryState(const IDevice* device, const std::string& prefix) {
MemoryReporter::inst().dump_memory_usage_state(device, std::move(prefix));
}

MemoryView MemoryReporter::get_memory_view(const IDevice* device, const BufferType& buffer_type) const {
auto stats = device->get_memory_allocation_statistics(buffer_type);
auto num_banks_ = device->num_banks(buffer_type);

return MemoryView{
.num_banks = num_banks_,
.total_bytes_per_bank = stats.total_allocatable_size_bytes,
.total_bytes_allocated_per_bank = stats.total_allocated_bytes,
.total_bytes_free_per_bank = stats.total_free_bytes,
.largest_contiguous_bytes_free_per_bank = stats.largest_free_block_bytes,
.block_table = device->get_memory_block_table(buffer_type)};
}

MemoryView GetMemoryView(const IDevice* device, const BufferType& buffer_type) {
return MemoryReporter::inst().get_memory_view(device, buffer_type);
}

bool MemoryReporter::enabled() { return is_enabled_; }

void MemoryReporter::toggle(bool state) { is_enabled_ = state; }
Expand Down
34 changes: 34 additions & 0 deletions tt_metal/detail/reports/memory_reporter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
#include <atomic>
#include <fstream>
#include <string>
#include <unordered_map>
#include <vector>

#include "tt_metal/impl/allocator/allocator.hpp"

namespace tt::tt_metal {
inline namespace v0 {

Expand All @@ -16,6 +21,7 @@ class IDevice;

} // namespace v0
namespace detail {
struct MemoryView;

/**
* Enable generation of reports for memory allocation statistics.
Expand Down Expand Up @@ -60,6 +66,32 @@ void DisableMemoryReports();
* */
void DumpDeviceMemoryState(const IDevice* device, const std::string& prefix = "");

/**
* Populates MemoryView for BufferType [dram, l1, l1 small, trace]. Used when storing to disk is not an option.
*
* num_banks: total number of BufferType banks for given device
* total_bytes_per_bank: total allocatable size per bank of BufferType in bytes
* total_bytes_allocated_per_bank: currently allocated size per bank of BufferType in bytes
* total_bytes_free_per_bank: total free size per bank of BufferType in bytes
* largest_contiguous_bytes_free_per_bank: largest contiguous free block of BufferType in bytes
* block_table: list of all blocks in BufferType (blockID, address, size, prevID, nextID, allocated)
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------|---------------------------------------------------|-----------------|--------------------------------------------------------|----------|
* | device | The device for which memory stats will be dumped. | const IDevice * | | True |
* | buffer_type | The type of buffer to populate the memory view. | const BufferType& | | True |
* */
MemoryView GetMemoryView(const IDevice* device, const BufferType& buffer_type);

struct MemoryView {
std::uint64_t num_banks = 0;
size_t total_bytes_per_bank = 0;
size_t total_bytes_allocated_per_bank = 0;
size_t total_bytes_free_per_bank = 0;
size_t largest_contiguous_bytes_free_per_bank = 0;
MemoryBlockTable block_table;
};

class MemoryReporter {
public:
MemoryReporter& operator=(const MemoryReporter&) = delete;
Expand All @@ -71,6 +103,8 @@ class MemoryReporter {

void dump_memory_usage_state(const IDevice* device, const std::string& prefix = "") const;

MemoryView get_memory_view(const IDevice* device, const BufferType& buffer_type) const;

static void toggle(bool state);
static MemoryReporter& inst();
static bool enabled();
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ class Algorithm {

virtual void dump_blocks(std::ostream& out) const = 0;

virtual MemoryBlockTable get_memory_block_table() const = 0;

virtual void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) = 0;

virtual void reset_size() = 0;
Expand Down
18 changes: 18 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,24 @@ void FreeList::dump_blocks(std::ostream& out) const {
out << "\n";
}

MemoryBlockTable FreeList::get_memory_block_table() const {
MemoryBlockTable blocks;
boost::local_shared_ptr<Block> curr_block = this->block_head_;

while (curr_block != nullptr) {
std::unordered_map<std::string, std::string> block_entry;

block_entry["address"] = std::to_string(curr_block->address + this->offset_bytes_); // bytes
block_entry["size"] = std::to_string(curr_block->size); // bytes
block_entry["allocated"] = this->is_allocated(curr_block) ? "yes" : "no";

blocks.push_back(block_entry);
curr_block = curr_block->next_block;
}

return blocks;
}

void FreeList::shrink_size(DeviceAddr shrink_size, bool bottom_up) {
if (shrink_size == 0) {
return;
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ class FreeList : public Algorithm {

void dump_blocks(std::ostream& out) const;

MemoryBlockTable get_memory_block_table() const;

void shrink_size(DeviceAddr shrink_size, bool bottom_up = true);

void reset_size();
Expand Down
22 changes: 22 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list_opt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,28 @@ void FreeListOpt::dump_blocks(std::ostream& out) const {
}
}

MemoryBlockTable FreeListOpt::get_memory_block_table() const {
MemoryBlockTable blocks;

for (size_t i = 0; i < block_address_.size(); i++) {
std::unordered_map<std::string, std::string> block_entry;

if (!meta_block_is_allocated_[i]) {
continue;
}

block_entry["blockID"] = std::to_string(i);
block_entry["address"] = std::to_string(block_address_[i]); // bytes
block_entry["size"] = std::to_string(block_size_[i]); // bytes
block_entry["prevID"] = std::to_string(block_prev_block_[i]);
block_entry["nextID"] = std::to_string(block_next_block_[i]);
block_entry["allocated"] = block_is_allocated_[i] ? "yes" : "no";
blocks.push_back(block_entry);
}

return blocks;
}

void FreeListOpt::shrink_size(DeviceAddr shrink_size, bool bottom_up) {
if (shrink_size == 0) {
return;
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list_opt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ class FreeListOpt : public Algorithm {

void dump_blocks(std::ostream& out) const override;

MemoryBlockTable get_memory_block_table() const override;

void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) override;

void reset_size() override;
Expand Down
21 changes: 21 additions & 0 deletions tt_metal/impl/allocator/allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,15 @@ void BankManager::dump_blocks(std::ofstream& out) const {
}
}

MemoryBlockTable BankManager::get_memory_block_table() const {
if (this->allocator_) {
return this->allocator_->get_memory_block_table();
}

log_warning("allocator is not initialized, cannot get block table for memory");
return {};
}

void BankManager::shrink_size(DeviceAddr shrink_size, bool bottom_up) {
if (this->allocator_) {
this->allocator_->shrink_size(shrink_size, bottom_up);
Expand Down Expand Up @@ -375,6 +384,18 @@ void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_typ
}
}

MemoryBlockTable get_memory_block_table(const Allocator& allocator, const BufferType& buffer_type) {
switch (buffer_type) {
case BufferType::DRAM: return allocator.dram_manager.get_memory_block_table();
case BufferType::L1: return allocator.l1_manager.get_memory_block_table();
case BufferType::L1_SMALL: return allocator.l1_small_manager.get_memory_block_table();
case BufferType::TRACE: return allocator.trace_buffer_manager.get_memory_block_table();
default: {
TT_THROW("Unsupported buffer type!");
}
}
}

std::optional<DeviceAddr> lowest_occupied_l1_address(const Allocator& allocator, uint32_t bank_id) {
// l1_manager always sits below l1_small_manager in the address space, so there is no need to check l1_small_manager
return allocator.l1_manager.lowest_occupied_address(bank_id);
Expand Down
4 changes: 4 additions & 0 deletions tt_metal/impl/allocator/allocator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class BankManager {

void dump_blocks(std::ofstream& out) const;

MemoryBlockTable get_memory_block_table() const;

void shrink_size(DeviceAddr shrink_size, bool bottom_up = true);
void reset_size();

Expand Down Expand Up @@ -122,6 +124,8 @@ Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_t

void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_type, std::ofstream& out);

MemoryBlockTable get_memory_block_table(const Allocator& allocator, const BufferType& buffer_type);

std::optional<DeviceAddr> lowest_occupied_l1_address(const Allocator& allocator, uint32_t bank_id);

DeviceAddr base_alloc(
Expand Down
7 changes: 7 additions & 0 deletions tt_metal/impl/allocator/allocator_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@
namespace tt::tt_metal {

// Fwd declares
/*
MemoryBlockTable is a list of memory blocks in the following format:
[{"blockID": "0", "address": "0", "size": "0", "prevID": "0", "nextID": "0", "allocated": true}]
address: bytes
size: bytes
*/
using MemoryBlockTable = std::vector<std::unordered_map<std::string, std::string>>;
struct Allocator;
namespace allocator {
class BankManager;
Expand Down
5 changes: 5 additions & 0 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1431,6 +1431,11 @@ void Device::dump_memory_blocks(const BufferType &buffer_type, std::ofstream &ou
return allocator::dump_memory_blocks(*allocator, buffer_type, out);
}

MemoryBlockTable Device::get_memory_block_table(const BufferType& buffer_type) const {
const auto& allocator = this->get_initialized_allocator();
return allocator::get_memory_block_table(*allocator, buffer_type);
}

const std::unordered_set<Buffer *> &Device::get_allocated_buffers() const {
const auto& allocator = this->get_initialized_allocator();
return allocator::get_allocated_buffers(*allocator);
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ class Device : public IDevice {
void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const override;
void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const override;

MemoryBlockTable get_memory_block_table(const BufferType &buffer_type) const override;

// Set of logical ethernet core coordinates
// core.x represents connectivity to one other chip, i.e. cores with <x> all connect to same chip
// core.y represents different channels along one <x>
Expand Down
9 changes: 9 additions & 0 deletions tt_metal/include/tt_metal/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@
namespace tt {

namespace tt_metal {
/*
MemoryBlockTable is a list of memory blocks in the following format:
[{"blockID": "0", "address": "0", "size": "0", "prevID": "0", "nextID": "0", "allocated": true}]
address: bytes
size: bytes
*/
using MemoryBlockTable = std::vector<std::unordered_map<std::string, std::string>>;
enum class BufferType;

inline namespace v0 {
Expand Down Expand Up @@ -162,6 +169,8 @@ class IDevice {
virtual void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const = 0;
virtual void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const = 0;

virtual MemoryBlockTable get_memory_block_table(const BufferType& buffer_type) const = 0;

// Set of logical ethernet core coordinates
// core.x represents connectivity to one other chip, i.e. cores with <x> all connect to same chip
// core.y represents different channels along one <x>
Expand Down
28 changes: 28 additions & 0 deletions ttnn/cpp/pybind11/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,18 @@ void py_device_module_types(py::module& m_device) {
py::class_<SubDeviceId>(m_device, "SubDeviceId", "ID of a sub-device.");

py::class_<SubDeviceManagerId>(m_device, "SubDeviceManagerId", "ID of a sub-device manager.");

py::class_<tt::tt_metal::detail::MemoryView>(
m_device, "MemoryView", "Class representing view of the memory (dram, l1, l1_small, trace) of a device.")
.def_readonly("num_banks", &tt::tt_metal::detail::MemoryView::num_banks)
.def_readonly("total_bytes_per_bank", &tt::tt_metal::detail::MemoryView::total_bytes_per_bank)
.def_readonly(
"total_bytes_allocated_per_bank", &tt::tt_metal::detail::MemoryView::total_bytes_allocated_per_bank)
.def_readonly("total_bytes_free_per_bank", &tt::tt_metal::detail::MemoryView::total_bytes_free_per_bank)
.def_readonly(
"largest_contiguous_bytes_free_per_bank",
&tt::tt_metal::detail::MemoryView::largest_contiguous_bytes_free_per_bank)
.def_readonly("block_table", &tt::tt_metal::detail::MemoryView::block_table);
}

void device_module(py::module& m_device) {
Expand Down Expand Up @@ -537,6 +549,22 @@ void device_module(py::module& m_device) {
+------------------+----------------------------------+-----------------------+-------------+----------+
)doc");

m_device.def(
"GetMemoryView",
&tt::tt_metal::detail::GetMemoryView,
py::arg().noconvert(),
py::arg().noconvert(),
R"doc(
Populates MemoryView for BufferType [dram, l1, l1 small, trace]. Used when storing to disk is not an option.

+------------------+----------------------------------+-----------------------+-------------+----------+
| Argument | Description | Data type | Valid range | Required |
+==================+==================================+=======================+=============+==========+
| device | Device to dump memory state for | ttnn.Device | | Yes |
| buffer_type | Type of buffer for memory view | ttnn.BufferType | | Yes |
+------------------+----------------------------------+-----------------------+-------------+----------+
)doc");

m_device.def(
"synchronize_device",
[](IDevice* device, const std::optional<uint8_t> cq_id, const std::vector<SubDeviceId>& sub_device_ids) {
Expand Down
3 changes: 2 additions & 1 deletion ttnn/cpp/pybind11/tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ void tensor_mem_config_module_types(py::module& m_tensor) {
py::enum_<tt::tt_metal::BufferType>(m_tensor, "BufferType")
.value("DRAM", BufferType::DRAM)
.value("L1", BufferType::L1)
.value("L1_SMALL", BufferType::L1_SMALL);
.value("L1_SMALL", BufferType::L1_SMALL)
.value("TRACE", BufferType::TRACE);

tt_serializable_class<tt::tt_metal::CoreCoord>(m_tensor, "CoreCoord", R"doc(
Class defining core coordinate
Expand Down
1 change: 1 addition & 0 deletions ttnn/ttnn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ def manage_config(name, value):
manage_device,
synchronize_device,
dump_device_memory_state,
get_memory_view,
GetPCIeDeviceID,
GetNumPCIeDevices,
GetNumAvailableDevices,
Expand Down
4 changes: 4 additions & 0 deletions ttnn/ttnn/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ def dump_device_memory_state(device, prefix=""):
ttnn._ttnn.device.DumpDeviceMemoryState(device, prefix)


def get_memory_view(device, buffer_type):
return ttnn._ttnn.device.GetMemoryView(device, buffer_type)


def is_wormhole_b0(device=None):
if device is not None:
return device.arch() == ttnn._ttnn.device.Arch.WORMHOLE_B0
Expand Down
Loading