Skip to content

Commit

Permalink
#16367: Added support to enable dram and l1 memory collection without…
Browse files Browse the repository at this point in the history
… saving to disk
  • Loading branch information
tapspatel committed Jan 8, 2025
1 parent 76b29a0 commit 8905345
Show file tree
Hide file tree
Showing 15 changed files with 324 additions and 0 deletions.
12 changes: 12 additions & 0 deletions tech_reports/memory/allocator.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,18 @@ These reports can be enabled from C++ and Python.
// API to dump state of memory for a given device. Optional prefix will be prepended to the report.
DumpDeviceMemoryState(const Device *device, std::string prefix="");

// API to get dram memory view for a given device
GetDramMemoryView(const Device *device);

// API to get l1 memory view for a given device
GetL1MemoryView(const Device *device);

// API to get l1 small memory view for a given device
GetL1SmallMemoryView(const Device *device);

// API to get trace memory view for a given device
GetTraceMemoryView(const Device *device);

// APIs to enable/disable memory reports for each Program's compile
EnableMemoryReports();
DisableMemoryReports();
Expand Down
43 changes: 43 additions & 0 deletions tt_metal/detail/reports/memory_reporter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,53 @@ void MemoryReporter::init_reports() {
write_headers(
this->program_memory_usage_summary_report_, this->program_l1_usage_summary_report_, /*add_program_id=*/true);
}

void DumpDeviceMemoryState(const Device* device, const std::string& prefix) {
MemoryReporter::inst().dump_memory_usage_state(device, std::move(prefix));
}

MemoryView get_memory_view(const Device* device, const BufferType& buffer_type) {
auto stats = device->get_memory_allocation_statistics(buffer_type);
auto num_banks_ = device->num_banks(buffer_type);

return MemoryView{
.num_banks = num_banks_,
.bytes_allocatable_per_bank = stats.total_allocatable_size_bytes,
.bytes_allocated_per_bank = stats.total_allocated_bytes,
.bytes_free_per_bank = stats.total_free_bytes,
.total_bytes_allocatable = stats.total_allocatable_size_bytes * num_banks_,
.total_bytes_allocated = stats.total_allocated_bytes * num_banks_,
.total_bytes_free = stats.total_free_bytes * num_banks_,
.largest_contiguous_bytes_free_per_bank = stats.largest_free_block_bytes,
.block_table = device->get_block_table(buffer_type)};
}

MemoryView MemoryReporter::get_dram_memory_view(const Device* device) const {
return get_memory_view(device, BufferType::DRAM);
}

MemoryView MemoryReporter::get_l1_memory_view(const Device* device) const {
return get_memory_view(device, BufferType::L1);
}

MemoryView MemoryReporter::get_l1_small_memory_view(const Device* device) const {
return get_memory_view(device, BufferType::L1_SMALL);
}

MemoryView MemoryReporter::get_trace_memory_view(const Device* device) const {
return get_memory_view(device, BufferType::TRACE);
}

MemoryView GetDramMemoryView(const Device* device) { return MemoryReporter::inst().get_dram_memory_view(device); }

MemoryView GetL1MemoryView(const Device* device) { return MemoryReporter::inst().get_l1_memory_view(device); }

MemoryView GetL1SmallMemoryView(const Device* device) {
return MemoryReporter::inst().get_l1_small_memory_view(device);
}

MemoryView GetTraceMemoryView(const Device* device) { return MemoryReporter::inst().get_trace_memory_view(device); }

bool MemoryReporter::enabled() { return is_enabled_; }

void MemoryReporter::toggle(bool state) { is_enabled_ = state; }
Expand Down
99 changes: 99 additions & 0 deletions tt_metal/detail/reports/memory_reporter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
#include <atomic>
#include <fstream>
#include <string>
#include <unordered_map>
#include <vector>

namespace tt::tt_metal {
inline namespace v0 {

Expand All @@ -16,6 +19,7 @@ class Device;

} // namespace v0
namespace detail {
struct MemoryView;

/**
* Enable generation of reports for memory allocation statistics.
Expand Down Expand Up @@ -60,6 +64,97 @@ void DisableMemoryReports();
* */
void DumpDeviceMemoryState(const Device* device, const std::string& prefix = "");

/**
* Populates MemoryView for DRAM. Used when storing to disk is not an option.
*
* num_banks: total number of dram banks for given device
* bytes_allocatable_per_bank: total allocatable size per bank of dram in bytes
* bytes_allocated_per_bank: currently allocated size per bank of dram in bytes
* bytes_free_per_bank: total free size per bank of dram in bytes
* total_bytes_allocatable: total allocatable size of dram in bytes
* total_bytes_allocated: currently allocated size of dram in bytes
* total_bytes_free: total free size of dram in bytes
* largest_contiguous_bytes_free_per_bank: largest contiguous free block of dram in bytes
* block_table: list of all blocks in dram (blockID, address, size, prevID, nextID, allocated)
*
* std::vector<std::unordered_map<std::string, std::string>>: list of all blocks in dram (blockID, address, size,
* prevID, nextID, allocated)
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------|---------------------------------------------------|-----------------|--------------------------------------------------------|----------|
* | device | The device for which memory stats will be dumped. | const Device * | | True |
* */
MemoryView GetDramMemoryView(const Device* device);

/**
* Populates MemoryView for L1. Used when storing to disk is not an option.
*
* num_banks: total number of l1 banks for given device
* bytes_allocatable_per_bank: total allocatable size per bank of l1 in bytes
* bytes_allocated_per_bank: currently allocated size per bank of l1 in bytes
* bytes_free_per_bank: total free size per bank of l1 in bytes
* total_bytes_allocatable: total allocatable size of l1 in bytes
* total_bytes_allocated: currently allocated size of l1 in bytes
* total_bytes_free: total free size of l1 in bytes
* largest_contiguous_bytes_free_per_bank: largest contiguous free block of l1 in bytes
* block_table: list of all blocks in l1 (blockID, address, size, prevID, nextID, allocated)
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------|---------------------------------------------------|-----------------|--------------------------------------------------------|----------|
* | device | The device for which memory stats will be dumped. | const Device * | | True |
* */
MemoryView GetL1MemoryView(const Device* device);

/**
* Populates MemoryView for L1 small. Used when storing to disk is not an option.
*
* num_banks: total number of l1 small banks for given device
* bytes_allocatable_per_bank: total allocatable size per bank of l1 small in bytes
* bytes_allocated_per_bank: currently allocated size per bank of l1 small in bytes
* bytes_free_per_bank: total free size per bank of l1 small in bytes
* total_bytes_allocatable: total allocatable size of l1 small in bytes
* total_bytes_allocated: currently allocated size of l1 small in bytes
* total_bytes_free: total free size of l1 small in bytes
* largest_contiguous_bytes_free_per_bank: largest contiguous free block of l1 small in bytes
* block_table: list of all blocks in l1 small (blockID, address, size, prevID, nextID, allocated)
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------|---------------------------------------------------|-----------------|--------------------------------------------------------|----------|
* | device | The device for which memory stats will be dumped. | const Device * | | True |
* */
MemoryView GetL1SmallMemoryView(const Device* device);

/**
* Populates MemoryView for trace. Used when storing to disk is not an option.
*
* num_banks: total number of trace banks for given device
* bytes_allocatable_per_bank: total allocatable size per bank of trace in bytes
* bytes_allocated_per_bank: currently allocated size per bank of trace in bytes
* bytes_free_per_bank: total free size per bank of trace in bytes
* total_bytes_allocatable: total allocatable size of trace in bytes
* total_bytes_allocated: currently allocated size of trace in bytes
* total_bytes_free: total free size of trace in bytes
* largest_contiguous_bytes_free_per_bank: largest contiguous free block of trace in bytes
* block_table: list of all blocks in trace (blockID, address, size, prevID, nextID, allocated)
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------|---------------------------------------------------|-----------------|--------------------------------------------------------|----------|
* | device | The device for which memory stats will be dumped. | const Device * | | True |
* */
MemoryView GetTraceMemoryView(const Device* device);

struct MemoryView {
std::uint64_t num_banks;
size_t bytes_allocatable_per_bank;
size_t bytes_allocated_per_bank;
size_t bytes_free_per_bank;
size_t total_bytes_allocatable; // bytes_allocatable_per_bank * num_banks
size_t total_bytes_allocated; // bytes_allocated_per_bank * num_banks
size_t total_bytes_free; // bytes_free_per_bank * num_banks
size_t largest_contiguous_bytes_free_per_bank;
std::vector<std::unordered_map<std::string, std::string>> block_table;
};

class MemoryReporter {
public:
MemoryReporter& operator=(const MemoryReporter&) = delete;
Expand All @@ -70,6 +165,10 @@ class MemoryReporter {
void flush_program_memory_usage(uint64_t program_id, const Device* device);

void dump_memory_usage_state(const Device* device, const std::string& prefix = "") const;
MemoryView get_dram_memory_view(const Device* device) const;
MemoryView get_l1_memory_view(const Device* device) const;
MemoryView get_l1_small_memory_view(const Device* device) const;
MemoryView get_trace_memory_view(const Device* device) const;

static void toggle(bool state);
static MemoryReporter& inst();
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ class Algorithm {

virtual void dump_blocks(std::ostream& out) const = 0;

virtual std::vector<std::unordered_map<std::string, std::string>> get_block_table() const = 0;

virtual void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) = 0;

virtual void reset_size() = 0;
Expand Down
18 changes: 18 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,24 @@ void FreeList::dump_blocks(std::ostream& out) const {
out << "\n";
}

std::vector<std::unordered_map<std::string, std::string>> FreeList::get_block_table() const {
std::vector<std::unordered_map<std::string, std::string>> blocks;
boost::local_shared_ptr<Block> curr_block = this->block_head_;

while (curr_block != nullptr) {
std::unordered_map<std::string, std::string> block_entry;

block_entry["address"] = std::to_string(curr_block->address + this->offset_bytes_); // bytes
block_entry["size"] = std::to_string(curr_block->size); // bytes
block_entry["allocated"] = this->is_allocated(curr_block) ? "yes" : "no";

blocks.push_back(block_entry);
curr_block = curr_block->next_block;
}

return blocks;
}

void FreeList::shrink_size(DeviceAddr shrink_size, bool bottom_up) {
if (shrink_size == 0) {
return;
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ class FreeList : public Algorithm {

void dump_blocks(std::ostream& out) const;

std::vector<std::unordered_map<std::string, std::string>> get_block_table() const;

void shrink_size(DeviceAddr shrink_size, bool bottom_up = true);

void reset_size();
Expand Down
22 changes: 22 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list_opt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,28 @@ void FreeListOpt::dump_blocks(std::ostream& out) const {
}
}

std::vector<std::unordered_map<std::string, std::string>> FreeListOpt::get_block_table() const {
std::vector<std::unordered_map<std::string, std::string>> blocks;

for (size_t i = 0; i < block_address_.size(); i++) {
std::unordered_map<std::string, std::string> blockEntry;

if (!meta_block_is_allocated_[i]) {
continue;
}

blockEntry["blockID"] = std::to_string(i);
blockEntry["address"] = std::to_string(block_address_[i]); // bytes
blockEntry["size"] = std::to_string(block_size_[i]); // bytes
blockEntry["prevID"] = std::to_string(block_prev_block_[i]);
blockEntry["nextID"] = std::to_string(block_next_block_[i]);
blockEntry["allocated"] = block_is_allocated_[i] ? "yes" : "no";
blocks.push_back(blockEntry);
}

return blocks;
}

void FreeListOpt::shrink_size(DeviceAddr shrink_size, bool bottom_up) {
if (shrink_size == 0) {
return;
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/allocator/algorithms/free_list_opt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ class FreeListOpt : public Algorithm {

void dump_blocks(std::ostream& out) const override;

std::vector<std::unordered_map<std::string, std::string>> get_block_table() const override;

void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) override;

void reset_size() override;
Expand Down
22 changes: 22 additions & 0 deletions tt_metal/impl/allocator/allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,15 @@ void BankManager::dump_blocks(std::ofstream& out) const {
}
}

std::vector<std::unordered_map<std::string, std::string>> BankManager::get_block_table() const {
if (this->allocator_) {
return this->allocator_->get_block_table();
}

log_warning("allocator is not initialized, cannot get block table for memory");
return {};
}

void BankManager::shrink_size(DeviceAddr shrink_size, bool bottom_up) {
if (this->allocator_) {
this->allocator_->shrink_size(shrink_size, bottom_up);
Expand Down Expand Up @@ -375,6 +384,19 @@ void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_typ
}
}

std::vector<std::unordered_map<std::string, std::string>> get_block_table(
const Allocator& allocator, const BufferType& buffer_type) {
switch (buffer_type) {
case BufferType::DRAM: return allocator.dram_manager.get_block_table();
case BufferType::L1: return allocator.l1_manager.get_block_table();
case BufferType::L1_SMALL: return allocator.l1_small_manager.get_block_table();
case BufferType::TRACE: return allocator.trace_buffer_manager.get_block_table();
default: {
TT_THROW("Unsupported buffer type!");
}
}
}

std::optional<DeviceAddr> lowest_occupied_l1_address(const Allocator& allocator, uint32_t bank_id) {
// l1_manager always sits below l1_small_manager in the address space, so there is no need to check l1_small_manager
return allocator.l1_manager.lowest_occupied_address(bank_id);
Expand Down
5 changes: 5 additions & 0 deletions tt_metal/impl/allocator/allocator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class BankManager {

void dump_blocks(std::ofstream& out) const;

std::vector<std::unordered_map<std::string, std::string>> get_block_table() const;

void shrink_size(DeviceAddr shrink_size, bool bottom_up = true);
void reset_size();

Expand Down Expand Up @@ -122,6 +124,9 @@ Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_t

void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_type, std::ofstream& out);

std::vector<std::unordered_map<std::string, std::string>> get_block_table(
const Allocator& allocator, const BufferType& buffer_type);

std::optional<DeviceAddr> lowest_occupied_l1_address(const Allocator& allocator, uint32_t bank_id);

DeviceAddr base_alloc(
Expand Down
5 changes: 5 additions & 0 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1426,6 +1426,11 @@ void Device::dump_memory_blocks(const BufferType &buffer_type, std::ofstream &ou
return allocator::dump_memory_blocks(*allocator, buffer_type, out);
}

std::vector<std::unordered_map<std::string, std::string>> Device::get_block_table(const BufferType &buffer_type) const {
const auto& allocator = this->get_initialized_allocator();
return allocator::get_block_table(*allocator, buffer_type);
}

const std::unordered_set<Buffer *> &Device::get_allocated_buffers() const {
const auto& allocator = this->get_initialized_allocator();
return allocator::get_allocated_buffers(*allocator);
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ class Device {
void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out) const;
void dump_memory_blocks(const BufferType &buffer_type, std::ofstream &out, SubDeviceId sub_device_id) const;

std::vector<std::unordered_map<std::string, std::string>> get_block_table(const BufferType &buffer_type) const;

// Set of logical ethernet core coordinates
// core.x represents connectivity to one other chip, i.e. cores with <x> all connect to same chip
// core.y represents different channels along one <x>
Expand Down
Loading

0 comments on commit 8905345

Please sign in to comment.