From d8bbeb0320d36c6103841928140088d1bdc8de7a Mon Sep 17 00:00:00 2001 From: Tapasvi Patel Date: Tue, 14 Jan 2025 23:14:09 +0000 Subject: [PATCH] #1683: Updated memory collection and report of dram and l1 using new metal APIs --- docs/src/ttrt.md | 7 +- runtime/include/tt/runtime/detail/ttmetal.h | 3 + runtime/include/tt/runtime/detail/ttnn.h | 3 + runtime/include/tt/runtime/runtime.h | 13 + runtime/include/tt/runtime/types.h | 23 ++ runtime/lib/runtime.cpp | 17 ++ runtime/lib/ttmetal/runtime.cpp | 43 +++ runtime/lib/ttnn/runtime.cpp | 42 +++ runtime/tools/python/ttrt/common/callback.py | 245 +++++++----------- runtime/tools/python/ttrt/common/run.py | 64 +---- runtime/tools/python/ttrt/runtime/__init__.py | 1 + runtime/tools/python/ttrt/runtime/module.cpp | 20 +- 12 files changed, 262 insertions(+), 219 deletions(-) diff --git a/docs/src/ttrt.md b/docs/src/ttrt.md index 0bd59cb02..6dc589287 100644 --- a/docs/src/ttrt.md +++ b/docs/src/ttrt.md @@ -174,7 +174,11 @@ ttrt run out.ttnn --save-artifacts --artifact-dir /path/to/some/dir ttrt run out.ttnn --load-kernels-from-disk ttrt run out.ttnn --enable-async-ttnn ttrt run out.ttnn --result-file result.json -ttrt run out.ttnn --golden +ttrt run out.ttnn --disable-golden +ttrt run out.ttnn --save-golden-tensors +ttrt run out.ttnn --debugger +ttrt run out.ttnn --memory --save-artifacts +ttrt run out.ttnn --memory --check-memory-leak ``` ### query @@ -219,6 +223,7 @@ ttrt perf /dir/of/flatbuffers --loops 10 --host-only ttrt perf /dir/of/flatbuffers --log-file ttrt.log --host-only ttrt perf --save-artifacts --artifact-dir /path/to/some/dir ttrt perf out.ttnn --result-file result.json +ttrt run out.ttnn --memory ``` To use the Tracy GUI, run the following instructions on your macbook. You can upload your .tracy file into the GUI to view the profiled dumps. 
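For illustration only (not part of the patch): a minimal sketch of how the new `get_memory_view` binding introduced below can be consumed from a user-defined ttrt callback. The `callback_runtime_config` fields and the callback signature mirror the built-in callbacks in `runtime/tools/python/ttrt/common/callback.py`; the function name `log_memory_usage` is hypothetical.

```python
# Sketch only, not included in this patch. Assumes the bindings added in
# runtime/tools/python/ttrt/runtime/module.cpp (MemoryView, MemoryBufferType,
# Device.get_memory_view) are available.
import ttrt.runtime


def log_memory_usage(callback_runtime_config, binary, program_context, op_context):
    device = callback_runtime_config.device
    logging = callback_runtime_config.logging

    # get_memory_view returns {MemoryBufferType: MemoryView} for one device.
    views = device.get_memory_view(device_id=0)
    dram = views[ttrt.runtime.MemoryBufferType.DRAM]
    l1 = views[ttrt.runtime.MemoryBufferType.L1]

    loc = ttrt.runtime.get_op_loc_info(op_context)
    logging.debug(
        f"op={loc} "
        f"dram_allocated_per_bank={dram.total_bytes_allocated_per_bank} "
        f"l1_free_per_bank={l1.total_bytes_free_per_bank} "
        f"l1_largest_contiguous_free={l1.largest_contiguous_bytes_free_per_bank}"
    )
```

This replaces the previous approach of dumping and re-parsing the CSV memory reports on disk: the per-bank numbers come straight from the `MemoryView` structs returned by the runtime, keyed by `DRAM`, `L1`, `L1_SMALL`, and `TRACE`.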
diff --git a/runtime/include/tt/runtime/detail/ttmetal.h b/runtime/include/tt/runtime/detail/ttmetal.h index aa7808bac..87e3c3604 100644 --- a/runtime/include/tt/runtime/detail/ttmetal.h +++ b/runtime/include/tt/runtime/detail/ttmetal.h @@ -44,6 +44,9 @@ void deallocateBuffers(Device device); void dumpMemoryReport(Device device); +std::unordered_map +getMemoryView(Device device, int deviceID = 0); + void wait(Event event); void wait(Tensor tensor); diff --git a/runtime/include/tt/runtime/detail/ttnn.h b/runtime/include/tt/runtime/detail/ttnn.h index 081ef02fe..e9ed590b9 100644 --- a/runtime/include/tt/runtime/detail/ttnn.h +++ b/runtime/include/tt/runtime/detail/ttnn.h @@ -95,6 +95,9 @@ void deallocateBuffers(Device device); void dumpMemoryReport(Device device); +std::unordered_map +getMemoryView(Device device, int deviceID = 0); + void wait(Event event); void wait(Tensor tensor); diff --git a/runtime/include/tt/runtime/runtime.h b/runtime/include/tt/runtime/runtime.h index 7725e4b56..1a668918c 100644 --- a/runtime/include/tt/runtime/runtime.h +++ b/runtime/include/tt/runtime/runtime.h @@ -20,6 +20,19 @@ std::pair getCurrentSystemDesc(); namespace detail { void deallocateBuffers(Device device); void dumpMemoryReport(Device device); + +/* +This function get the memory view per device + { + "DRAM": MemoryView, + "L1": MemoryView, + "L1Small": MemoryView, + "Trace": MemoryView + } +*/ +std::unordered_map +getMemoryView(Device device, int deviceID = 0); + } // namespace detail DeviceRuntime getCurrentRuntime(); diff --git a/runtime/include/tt/runtime/types.h b/runtime/include/tt/runtime/types.h index cc2791e23..154501512 100644 --- a/runtime/include/tt/runtime/types.h +++ b/runtime/include/tt/runtime/types.h @@ -18,6 +18,20 @@ #pragma clang diagnostic pop namespace tt::runtime { +/* +MemoryBlockTable is a list of memory blocks in the following format: +[{"blockID": "0", "address": "0", "size": "0", "prevID": "0", "nextID": "0", +"allocated": true}] address: bytes size: bytes +*/ +using MemoryBlockTable = + std::vector>; + +enum class MemoryBufferType { + DRAM, + L1, + L1_SMALL, + TRACE, +}; enum class DeviceRuntime { Disabled, @@ -146,6 +160,15 @@ struct OpContext : public detail::RuntimeCheckedObjectImpl { using detail::RuntimeCheckedObjectImpl::RuntimeCheckedObjectImpl; }; +struct MemoryView { + std::uint64_t numBanks = 0; + size_t totalBytesPerBank = 0; + size_t totalBytesAllocatedPerBank = 0; + size_t totalBytesFreePerBank = 0; + size_t largestContiguousBytesFreePerBank = 0; + MemoryBlockTable blockTable; +}; + } // namespace tt::runtime #endif diff --git a/runtime/lib/runtime.cpp b/runtime/lib/runtime.cpp index b0ac1ee43..f4fa49f2a 100644 --- a/runtime/lib/runtime.cpp +++ b/runtime/lib/runtime.cpp @@ -60,6 +60,23 @@ void dumpMemoryReport(Device device) { LOG_FATAL("runtime is not enabled"); } + +std::unordered_map +getMemoryView(Device device, int deviceID) { +#if defined(TT_RUNTIME_ENABLE_TTNN) + if (getCurrentRuntime() == DeviceRuntime::TTNN) { + return ::tt::runtime::ttnn::getMemoryView(device, deviceID); + } +#endif + +#if defined(TT_RUNTIME_ENABLE_TTMETAL) + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + return ::tt::runtime::ttmetal::getMemoryView(device, deviceID); + } +#endif + + LOG_FATAL("runtime is not enabled"); +} } // namespace detail DeviceRuntime getCurrentRuntime() { diff --git a/runtime/lib/ttmetal/runtime.cpp b/runtime/lib/ttmetal/runtime.cpp index f2702b52a..ab20c3cc3 100644 --- a/runtime/lib/ttmetal/runtime.cpp +++ b/runtime/lib/ttmetal/runtime.cpp @@ -33,6 
+33,19 @@ static Tensor createNullTensor() { return Tensor(nullptr, nullptr, DeviceRuntime::TTMetal); } +static tt::runtime::MemoryView +createMemoryView(tt::tt_metal::detail::MemoryView const &memoryView) { + return tt::runtime::MemoryView{ + .numBanks = memoryView.num_banks, + .totalBytesPerBank = memoryView.total_bytes_per_bank, + .totalBytesAllocatedPerBank = memoryView.total_bytes_allocated_per_bank, + .totalBytesFreePerBank = memoryView.total_bytes_free_per_bank, + .largestContiguousBytesFreePerBank = + memoryView.largest_contiguous_bytes_free_per_bank, + .blockTable = memoryView.block_table, + }; +} + Tensor createTensor(std::shared_ptr data, std::vector const &shape, std::vector const &stride, @@ -116,6 +129,36 @@ void dumpMemoryReport(Device deviceHandle) { } } +std::unordered_map +getMemoryView(Device deviceHandle, int deviceID) { + std::unordered_map + memoryMap; + ::tt::tt_metal::distributed::MeshDevice &meshDevice = + deviceHandle.as<::tt::tt_metal::distributed::MeshDevice>( + DeviceRuntime::TTMetal); + + auto device = meshDevice.get_device(deviceID); + + auto dramMemoryView = ::tt::tt_metal::detail::GetMemoryView( + device, tt::tt_metal::BufferType::DRAM); + auto l1MemoryView = ::tt::tt_metal::detail::GetMemoryView( + device, tt::tt_metal::BufferType::L1); + auto l1SmallMemoryView = ::tt::tt_metal::detail::GetMemoryView( + device, tt::tt_metal::BufferType::L1_SMALL); + auto traceMemoryView = ::tt::tt_metal::detail::GetMemoryView( + device, tt::tt_metal::BufferType::TRACE); + + memoryMap[tt::runtime::MemoryBufferType::DRAM] = + createMemoryView(dramMemoryView); + memoryMap[tt::runtime::MemoryBufferType::L1] = createMemoryView(l1MemoryView); + memoryMap[tt::runtime::MemoryBufferType::L1_SMALL] = + createMemoryView(l1SmallMemoryView); + memoryMap[tt::runtime::MemoryBufferType::TRACE] = + createMemoryView(traceMemoryView); + + return memoryMap; +} + void wait(Event event) { Events events = event.as(DeviceRuntime::TTMetal); for (auto e : events) { diff --git a/runtime/lib/ttnn/runtime.cpp b/runtime/lib/ttnn/runtime.cpp index 72378f881..593433581 100644 --- a/runtime/lib/ttnn/runtime.cpp +++ b/runtime/lib/ttnn/runtime.cpp @@ -94,6 +94,19 @@ static DeviceVariant getTargetDevice(::ttnn::MeshDevice &meshDevice) { return std::ref(meshDevice); } +static tt::runtime::MemoryView +createMemoryView(tt::tt_metal::detail::MemoryView const &memoryView) { + return tt::runtime::MemoryView{ + .numBanks = memoryView.num_banks, + .totalBytesPerBank = memoryView.total_bytes_per_bank, + .totalBytesAllocatedPerBank = memoryView.total_bytes_allocated_per_bank, + .totalBytesFreePerBank = memoryView.total_bytes_free_per_bank, + .largestContiguousBytesFreePerBank = + memoryView.largest_contiguous_bytes_free_per_bank, + .blockTable = memoryView.block_table, + }; +} + static ::tt::target::ttnn::TTNNBinary const *getBinary(Flatbuffer binary) { bool isTTNN = ::tt::target::ttnn::SizePrefixedTTNNBinaryBufferHasIdentifier( binary.handle.get()); @@ -225,6 +238,35 @@ void dumpMemoryReport(Device deviceHandle) { } } +std::unordered_map +getMemoryView(Device deviceHandle, int deviceID) { + std::unordered_map + memoryMap; + ::ttnn::MeshDevice &meshDevice = + deviceHandle.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + + auto device = meshDevice.get_device(deviceID); + + auto dramMemoryView = ::tt::tt_metal::detail::GetMemoryView( + device, tt::tt_metal::BufferType::DRAM); + auto l1MemoryView = ::tt::tt_metal::detail::GetMemoryView( + device, tt::tt_metal::BufferType::L1); + auto l1SmallMemoryView = 
::tt::tt_metal::detail::GetMemoryView( + device, tt::tt_metal::BufferType::L1_SMALL); + auto traceMemoryView = ::tt::tt_metal::detail::GetMemoryView( + device, tt::tt_metal::BufferType::TRACE); + + memoryMap[tt::runtime::MemoryBufferType::DRAM] = + createMemoryView(dramMemoryView); + memoryMap[tt::runtime::MemoryBufferType::L1] = createMemoryView(l1MemoryView); + memoryMap[tt::runtime::MemoryBufferType::L1_SMALL] = + createMemoryView(l1SmallMemoryView); + memoryMap[tt::runtime::MemoryBufferType::TRACE] = + createMemoryView(traceMemoryView); + + return memoryMap; +} + void wait(Event event) { // Nothing to do for ttnn runtime LOG_ASSERT(event.matchesRuntime(DeviceRuntime::TTNN)); diff --git a/runtime/tools/python/ttrt/common/callback.py b/runtime/tools/python/ttrt/common/callback.py index 56223224f..93a9af267 100644 --- a/runtime/tools/python/ttrt/common/callback.py +++ b/runtime/tools/python/ttrt/common/callback.py @@ -62,6 +62,49 @@ def save_memory_report(self, memory_report_path): self.logging.debug(f"Saved memory report to={memory_report_path}") + def check_pcc(self): + for loc, golden_data in self.golden_report.items(): + if golden_data["actual_pcc"] < golden_data["expected_pcc"]: + raise Exception( + f"Failed: golden comparison failed, actual_pcc={golden_data['actual_pcc']} < expected_pcc={golden_data['expected_pcc']}" + ) + + def check_memory_leak(self): + num_items = 0 + for key, value in self.memory_report.items(): + num_items += 1 + + if num_items == 0: + self.logging.warning(f"No memory data found") + else: + # query initial memory usage + dram_initial_size_per_device = self.memory_report[0]["dram"] + l1_initial_size_per_device = self.memory_report[0]["l1"] + + # query final memory usage and ensure no memory leaks + dram_final_size_per_device = self.memory_report[num_items - 1]["dram"] + l1_final_size_per_device = self.memory_report[num_items - 1]["l1"] + + for key, value in dram_initial_size_per_device.items(): + dram_initial_size = value["total_bytes_allocated_per_bank"] + dram_final_size = dram_final_size_per_device[key][ + "total_bytes_allocated_per_bank" + ] + + if dram_final_size > dram_initial_size: + raise Exception(f"Memory leak detected in DRAM for device={key}") + + for key, value in l1_initial_size_per_device.items(): + l1_initial_size = value["total_bytes_allocated_per_bank"] + l1_final_size = l1_final_size_per_device[key][ + "total_bytes_allocated_per_bank" + ] + + if l1_final_size > l1_initial_size: + raise Exception( + f"Memory leak detected in L1 cache for device={key}" + ) + """ -----------------------GOLDEN CALLBACK----------------------- @@ -234,151 +277,20 @@ def golden(callback_runtime_config, binary, program_context, op_context): """ -def add_key(dram_memory_usage, l1_memory_usage, current_section, key, value): - if current_section == "DRAM": - dram_memory_usage[key] = value - elif current_section == "L1": - l1_memory_usage[key] = value - - -def parse_detailed_memory_usage_file(dram_memory_usage, l1_memory_usage, file_path): - current_section = None - - with open(file_path, "r") as file: - reader = csv.reader(file) - blocks = [] - - for row in reader: - if not any(row): - continue - - if row[1].strip() == "DRAM": - current_section = "DRAM" - elif row[1].strip() == "L1": - current_section = "L1" - elif "Total" in row[1]: - if row[1].strip() == "Total allocatable (B):": - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "total_allocatable (bytes) : total_allocatable/bank * num_banks", - row[2].strip(), - ) - elif row[1].strip() == "Total 
allocated (B):": - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "total_allocated (bytes) : total_allocated/bank * num_banks", - row[2].strip(), - ) - elif row[1].strip() == "Total free (B):": - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "total_free (bytes) : total_allocatable - total_allocated", - row[2].strip(), - ) - elif "Blocks" in row[2]: - blocks = [] - else: - block = {} - block["address (bytes)"] = row[3].strip() - block["size (bytes)"] = row[4].strip() - block["allocated (y/n)"] = row[5].strip() - - blocks.append(block) - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "blocks", - blocks, - ) - - -def parse_memory_usage_summary_file(dram_memory_usage, l1_memory_usage, file_path): - with open(file_path, "r") as file: - reader = csv.reader(file) - current_section = "DRAM" - - for row in reader: - if not any(row): - continue - - if "Total Allocatable Size" in row[1]: - continue - - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "total_allocatable (bytes) : per bank", - row[1].strip(), - ) - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "total_allocated (bytes): per bank", - row[2].strip(), - ) - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "total_free (bytes) : per bank", - row[3].strip(), - ) - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "largest_free_block (bytes) : per bank", - row[4].strip(), - ) - - if current_section == "DRAM": - current_section = "L1" - +def create_memory_dictionary(memory_view): + memory_dict = {} + memory_dict["num_banks"] = memory_view.num_banks + memory_dict["total_bytes_per_bank"] = memory_view.total_bytes_per_bank + memory_dict[ + "total_bytes_allocated_per_bank" + ] = memory_view.total_bytes_allocated_per_bank + memory_dict["total_bytes_free_per_bank"] = memory_view.total_bytes_free_per_bank + memory_dict[ + "largest_contiguous_bytes_free_per_bank" + ] = memory_view.largest_contiguous_bytes_free_per_bank + memory_dict["block_table"] = memory_view.block_table -def parse_l1_usage_summary_file(dram_memory_usage, l1_memory_usage, file_path): - with open(file_path, "r") as file: - reader = csv.reader(file) - dram_row = True - - for index, row in enumerate(reader): - if index == 2: - add_key( - dram_memory_usage, - l1_memory_usage, - "L1", - "largest_contiguous_free_block (bytes) : per bank", - row[1].strip(), - ) - - -def parse_memory_csv_files( - detailed_memory_usage_file_path, - memory_usage_summary_file_path, - l1_usage_summary_file_path, -): - dram_memory_usage = {} - l1_memory_usage = {} - - parse_detailed_memory_usage_file( - dram_memory_usage, l1_memory_usage, detailed_memory_usage_file_path - ) - parse_memory_usage_summary_file( - dram_memory_usage, l1_memory_usage, memory_usage_summary_file_path - ) - parse_l1_usage_summary_file( - dram_memory_usage, l1_memory_usage, l1_usage_summary_file_path - ) - - return dram_memory_usage, l1_memory_usage + return memory_dict def memory(callback_runtime_config, binary, program_context, op_context): @@ -390,22 +302,43 @@ def memory(callback_runtime_config, binary, program_context, op_context): logging.debug("executing memory dump") loc = ttrt.runtime.get_op_loc_info(op_context) debug_str = ttrt.runtime.get_op_debug_str(op_context) + device_id = 0 - device.dump_memory_report() - memory_dump_dir_path = f"{get_ttrt_metal_home_path()}/generated/reports" - - # read generated memory reports and store in condensed memory_report - dram_memory_usage, 
l1_memory_usage = parse_memory_csv_files( - f"{memory_dump_dir_path}/detailed_memory_usage.csv", - f"{memory_dump_dir_path}/memory_usage_summary.csv", - f"{memory_dump_dir_path}/l1_usage_summary.csv", - ) + memory_views = device.get_memory_view(device_id) + dram_memory_view = memory_views[ttrt.runtime.MemoryBufferType.DRAM] + l1_memory_view = memory_views[ttrt.runtime.MemoryBufferType.L1] + l1_small_memory_view = memory_views[ttrt.runtime.MemoryBufferType.L1_SMALL] + trace_memory_view = memory_views[ttrt.runtime.MemoryBufferType.TRACE] op_memory_report = {} op_memory_report["loc"] = loc op_memory_report["debug_str"] = debug_str - op_memory_report["dram"] = dram_memory_usage - op_memory_report["l1"] = l1_memory_usage + + dram_op_device_memory_report = {} + dram_op_device_memory_report["device_" + str(device_id)] = create_memory_dictionary( + dram_memory_view + ) + + l1_op_device_memory_report = {} + l1_op_device_memory_report["device_" + str(device_id)] = create_memory_dictionary( + l1_memory_view + ) + + l1_small_op_device_memory_report = {} + l1_small_op_device_memory_report[ + "device_" + str(device_id) + ] = create_memory_dictionary(l1_small_memory_view) + + trace_op_device_memory_report = {} + trace_op_device_memory_report[ + "device_" + str(device_id) + ] = create_memory_dictionary(trace_memory_view) + + op_memory_report["dram"] = dram_op_device_memory_report + op_memory_report["l1"] = l1_op_device_memory_report + op_memory_report["l1_small"] = l1_small_op_device_memory_report + op_memory_report["trace"] = trace_op_device_memory_report + callback_runtime_config.memory_report[ callback_runtime_config.callback_counter() ] = op_memory_report @@ -418,6 +351,8 @@ def memory(callback_runtime_config, binary, program_context, op_context): def debugger(callback_runtime_config, binary, program_context, op_context): import pdb + import ttrt.runtime + import ttrt.binary device = callback_runtime_config.device logging = callback_runtime_config.logging diff --git a/runtime/tools/python/ttrt/common/run.py b/runtime/tools/python/ttrt/common/run.py index b83c5d390..d5f468739 100644 --- a/runtime/tools/python/ttrt/common/run.py +++ b/runtime/tools/python/ttrt/common/run.py @@ -589,17 +589,7 @@ def _execute(binaries): f"{self.artifacts.get_binary_folder_path(bin)}/run/program_{program_index}/golden_results.json" ) - for ( - loc, - golden_data, - ) in callback_runtime_config.golden_report.items(): - if ( - golden_data["actual_pcc"] - < golden_data["expected_pcc"] - ): - raise Exception( - f"Failed: golden comparison failed for program={program_index}, actual_pcc={golden_data['actual_pcc']} < expected_pcc={golden_data['expected_pcc']}" - ) + callback_runtime_config.check_pcc() if self["--memory"]: if self["--save-artifacts"]: @@ -608,57 +598,7 @@ def _execute(binaries): ) if self["--check-memory-leak"]: - num_items = 0 - for ( - key, - value, - ) in callback_runtime_config.memory_report.items(): - num_items += 1 - - if num_items == 0: - self.logging.warning(f"No memory data found") - else: - # query initial memory usage - dram_initial_size = callback_runtime_config.memory_report[ - 0 - ][ - "dram" - ][ - "total_allocated (bytes) : total_allocated/bank * num_banks" - ] - l1_initlal_size = callback_runtime_config.memory_report[ - 0 - ][ - "l1" - ][ - "total_allocated (bytes) : total_allocated/bank * num_banks" - ] - - # query final memory usage and ensure no memory leaks - dram_final_size = callback_runtime_config.memory_report[ - num_items - 1 - ][ - "dram" - ][ - "total_allocated (bytes) : 
total_allocated/bank * num_banks" - ] - l1_final_size = callback_runtime_config.memory_report[ - num_items - 1 - ][ - "l1" - ][ - "total_allocated (bytes) : total_allocated/bank * num_banks" - ] - - if dram_final_size > dram_initial_size: - raise Exception( - "Memory leak detected in DRAM" - ) - - if l1_final_size > l1_initlal_size: - raise Exception( - "Memory leak detected in L1 cache" - ) + callback_runtime_config.check_memory_leak() except Exception as e: test_result = { diff --git a/runtime/tools/python/ttrt/runtime/__init__.py b/runtime/tools/python/ttrt/runtime/__init__.py index 5ff953da5..b394dd901 100644 --- a/runtime/tools/python/ttrt/runtime/__init__.py +++ b/runtime/tools/python/ttrt/runtime/__init__.py @@ -7,6 +7,7 @@ Device, Event, Tensor, + MemoryBufferType, DataType, DeviceRuntime, DebugEnv, diff --git a/runtime/tools/python/ttrt/runtime/module.cpp b/runtime/tools/python/ttrt/runtime/module.cpp index f55b6b81e..ff1d9b8d0 100644 --- a/runtime/tools/python/ttrt/runtime/module.cpp +++ b/runtime/tools/python/ttrt/runtime/module.cpp @@ -21,14 +21,32 @@ namespace py = pybind11; PYBIND11_MODULE(_C, m) { m.doc() = "ttrt.runtime python extension for interacting with the " "Tenstorrent devices"; + py::class_(m, "MemoryView") + .def_readonly("num_banks", &tt::runtime::MemoryView::numBanks) + .def_readonly("total_bytes_per_bank", + &tt::runtime::MemoryView::totalBytesPerBank) + .def_readonly("total_bytes_allocated_per_bank", + &tt::runtime::MemoryView::totalBytesAllocatedPerBank) + .def_readonly("total_bytes_free_per_bank", + &tt::runtime::MemoryView::totalBytesFreePerBank) + .def_readonly("largest_contiguous_bytes_free_per_bank", + &tt::runtime::MemoryView::largestContiguousBytesFreePerBank) + .def_readonly("block_table", &tt::runtime::MemoryView::blockTable); py::class_(m, "Device") .def("deallocate_buffers", &tt::runtime::detail::deallocateBuffers) - .def("dump_memory_report", &tt::runtime::detail::dumpMemoryReport); + .def("dump_memory_report", &tt::runtime::detail::dumpMemoryReport) + .def("get_memory_view", &tt::runtime::detail::getMemoryView, + py::arg("device_id") = 0); py::class_(m, "Event"); py::class_(m, "Tensor"); py::class_(m, "Layout"); py::class_(m, "OpContext"); py::class_(m, "CallbackContext"); + py::enum_(m, "MemoryBufferType") + .value("DRAM", tt::runtime::MemoryBufferType::DRAM) + .value("L1", tt::runtime::MemoryBufferType::L1) + .value("L1_SMALL", tt::runtime::MemoryBufferType::L1_SMALL) + .value("TRACE", tt::runtime::MemoryBufferType::TRACE); py::enum_<::tt::target::DataType>(m, "DataType") .value("Float32", ::tt::target::DataType::Float32) .value("Float16", ::tt::target::DataType::Float16)