Skip to content

Commit

Permalink
Trying to clean up memory allocation tracking
Browse files Browse the repository at this point in the history
When tracking allocations on the host, everything seems to be working
correctly, but on occasion we see allocation amounts changing on the
stack in gdb on Frontier. We can't explain it yet, but some fixes are
included in this commit.
  • Loading branch information
khuck committed Feb 28, 2024
1 parent 806c489 commit c985b21
Show file tree
Hide file tree
Showing 8 changed files with 150 additions and 221 deletions.
4 changes: 3 additions & 1 deletion src/apex/apex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,7 @@ inline std::shared_ptr<task_wrapper> _new_task(
task_identifier * id,
const uint64_t task_id,
const std::shared_ptr<task_wrapper> parent_task, apex* instance) {
in_apex prevent_deadlocks;
APEX_UNUSED(instance);
std::shared_ptr<task_wrapper> tt_ptr = make_shared<task_wrapper>();
tt_ptr->task_id = id;
Expand Down Expand Up @@ -1670,7 +1671,7 @@ void finalize_plugins(void) {

std::string dump(bool reset, bool finalizing) {
in_apex prevent_deadlocks;
static size_t index{0};
static int index{0};
// if APEX is disabled, do nothing.
if (apex_options::disable() == true ||
(!finalizing && apex_options::use_final_output_only()))
Expand Down Expand Up @@ -1698,6 +1699,7 @@ std::string dump(bool reset, bool finalizing) {
controlMemoryWrapper(true);
}
if (_notify_listeners) {
//apex_get_leak_symbols();
dump_event_data data(instance->get_node_id(),
thread_instance::get_id(), reset);
for (unsigned int i = 0 ; i < instance->listeners.size() ; i++) {
Expand Down
3 changes: 2 additions & 1 deletion src/apex/apex_preload.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ int apex_preload_main(int argc, char** argv, char** envp) {
ret = main_real(argc, argv, envp);
} else {
apex::init("APEX Preload", 0, 1);
auto t = apex::new_task(__APEX_FUNCTION__);
const std::string timerName{__APEX_FUNCTION__};
auto t = apex::new_task(timerName);
apex::start(t);
ret = main_real(argc, argv, envp);
apex::stop(t);
Expand Down
4 changes: 2 additions & 2 deletions src/apex/hip_trace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,7 @@ bool getBytesIfMalloc(uint32_t cid, const hip_api_data_t* data,
hostTotalAllocated.fetch_add(bytes, std::memory_order_relaxed);
value = (double)(hostTotalAllocated);
store_sync_counter_data(nullptr, "Total Bytes Occupied on Host", value, false);
apex::recordAlloc(bytes, ptr, apex::GPU_HOST_MALLOC);
apex::recordAlloc(bytes, ptr, APEX_GPU_HOST_MALLOC);
return true;
} else {
if (managed) {
Expand All @@ -587,7 +587,7 @@ bool getBytesIfMalloc(uint32_t cid, const hip_api_data_t* data,
totalAllocated.fetch_add(bytes, std::memory_order_relaxed);
value = (double)(totalAllocated);
store_sync_counter_data(nullptr, "Total Bytes Occupied on Device", value, false);
apex::recordAlloc(bytes, ptr, apex::GPU_DEVICE_MALLOC, false);
apex::recordAlloc(bytes, ptr, APEX_GPU_DEVICE_MALLOC, false);
}
// how much memory does SMI think we have?
apex::rsmi::monitor::instance().explicitMemCheck();
Expand Down
60 changes: 48 additions & 12 deletions src/apex/memory_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
namespace apex {

static const char * allocator_strings[] = {
"malloc", "calloc", "realloc", "gpu_host_malloc", "gpu_device_malloc"
"malloc", "calloc", "realloc", "gpu_host_malloc", "gpu_device_malloc", "free"
};

book_t& getBook() {
Expand Down Expand Up @@ -98,10 +98,10 @@ void disable_memory_wrapper() {
}

void printBacktrace() {
void *trace[32];
void *trace[64];
size_t size, i;
char **strings;
size = backtrace( trace, 32 );
size = backtrace( trace, 64 );
strings = backtrace_symbols( trace, size );
std::cerr << std::endl;
// skip the first frame, it is this handler
Expand All @@ -110,7 +110,8 @@ void printBacktrace() {
}
}

void recordAlloc(size_t bytes, void* ptr, allocator_t alloc, bool cpu) {
void recordAlloc(const size_t bytes, const void* ptr,
const apex_allocator_t alloc, const bool cpu) {
if (!recording()) return;
static book_t& book = getBook();
double value = (double)(bytes);
Expand All @@ -123,7 +124,7 @@ void recordAlloc(size_t bytes, void* ptr, allocator_t alloc, bool cpu) {
tmp.size = backtrace(tmp.backtrace.data(), tmp.backtrace.size());
book.mapMutex.lock();
//book.memoryMap[ptr] = value;
book.memoryMap.insert(std::pair<void*,record_t>(ptr, tmp));
book.memoryMap.insert(std::pair<const void*,record_t>(ptr, tmp));
book.mapMutex.unlock();
book.totalAllocated.fetch_add(bytes, std::memory_order_relaxed);
if (p == nullptr) {
Expand All @@ -140,7 +141,7 @@ void recordAlloc(size_t bytes, void* ptr, allocator_t alloc, bool cpu) {
if (cpu) sample_value("Memory: Total Bytes Occupied", value);
}

void recordFree(void* ptr, bool cpu) {
void recordFree(const void* ptr, const bool cpu) {
if (!recording()) return;
static book_t& book = getBook();
size_t bytes;
Expand Down Expand Up @@ -184,8 +185,8 @@ void recordMetric(std::string name, double value) {
}

// Comparator function to sort pairs descending, according to second value
bool cmp(std::pair<void*, record_t>& a,
std::pair<void*, record_t>& b)
bool cmp(std::pair<const void*, record_t>& a,
std::pair<const void*, record_t>& b)
{
return a.second.bytes > b.second.bytes;
}
Expand All @@ -197,6 +198,22 @@ bool cmp2(std::pair<std::string, size_t>& a,
return a.second > b.second;
}

/* Resolve the saved backtrace addresses of every tracked allocation into
 * symbol strings, caching them in each record so that later reporting can
 * use the stored text instead of looking the addresses up again.
 * Does nothing unless CPU memory tracking is enabled and recording is on. */
void apex_get_leak_symbols() {
    in_apex prevent_memory_tracking;
    if (!apex_options::track_cpu_memory() || !recording()) { return; }
    static book_t& book = getBook();
    for (auto& entry : book.memoryMap) {
        auto& rec = entry.second;
        // Only the first rec.size frames were filled in when the
        // allocation was recorded.
        for (size_t frame = 0; frame < rec.size; ++frame) {
            const auto address = reinterpret_cast<uintptr_t>(rec.backtrace[frame]);
            // NOTE(review): the returned pointer is intentionally not
            // deleted here; presumably lookup_address keeps ownership of
            // the string - confirm against its implementation.
            rec.symbols[frame] = *lookup_address(address, true);
        }
        rec.resolved = true;
    }
}

void apex_report_leaks() {
if (!apex_options::track_gpu_memory() && !apex_options::track_cpu_memory()) {
return;
Expand All @@ -211,7 +228,7 @@ void apex_report_leaks() {
std::string outfile{ss.str()};
std::ofstream report (outfile);
// Declare vector of pairs
std::vector<std::pair<void*, record_t> > sorted;
std::vector<std::pair<const void*, record_t> > sorted;

if (book.saved_node_id == 0) {
std::cout << "APEX Memory Report: (see " << outfile << ")" << std::endl;
Expand All @@ -238,6 +255,7 @@ void apex_report_leaks() {
}
size_t actual_leaks{0};
// Print the sorted value
size_t actual_bytes{0};
for (auto& it : sorted) {
std::stringstream ss;
//if (it.second.bytes > 1000) {
Expand Down Expand Up @@ -266,13 +284,27 @@ void apex_report_leaks() {
if (tmp.find("pthread_once", 0) != std::string::npos) { skip = true; break; }
if (tmp.find("atexit", 0) != std::string::npos) { skip = true; break; }
if (tmp.find("apex_pthread_function", 0) != std::string::npos) { skip = true; break; }
if (tmp.find("hipFuncGetAttributes", 0) != std::string::npos) { skip = true; break; }
if (nameless) {
if (tmp.find("libcuda", 0) != std::string::npos) { skip = true; break; }
if (tmp.find("GOMP_parallel", 0) != std::string::npos) { skip = true; break; }
}
}
std::string* tmp2{lookup_address(((uintptr_t)it.second.backtrace[i]), true)};
ss << "\t" << *tmp2 << std::endl;
const std::string unknown{"{(unknown)}"};
if (it.second.resolved) {
if (it.second.symbols[i].find(unknown) == std::string::npos) {
ss << "\t" << it.second.symbols[i] << std::endl;
} else {
ss << "\t" << tmp << std::endl;
}
} else {
std::string* tmp2{lookup_address(((uintptr_t)it.second.backtrace[i]), true)};
if (tmp2->find(unknown) == std::string::npos) {
ss << "\t" << *tmp2 << std::endl;
} else {
ss << "\t" << tmp << std::endl;
}
}
}
if (skip) { continue; }

Expand All @@ -295,10 +327,14 @@ void apex_report_leaks() {
*/
report << ss.str();
actual_leaks++;
actual_bytes+=it.second.bytes;
}
report.close();
if (book.saved_node_id == 0) {
std::cout << "Reported " << actual_leaks << " 'actual' leaks.\nExpect false positives if memory was freed after exit." << std::endl;
std::cout << "Reported " << actual_leaks << " 'actual' leaks of "
<< actual_bytes
<< " bytes.\nExpect false positives if memory was freed after exit."
<< std::endl;
}
if (actual_leaks == 0) {
remove(outfile.c_str());
Expand Down
44 changes: 21 additions & 23 deletions src/apex/memory_wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,29 +14,33 @@
#pragma once
#include <apex.hpp>

/* Kind of memory operation that produced (or released) a tracked pointer.
 * Declared at global scope, ahead of namespace apex.
 * The enumerator order matches the entries of allocator_strings in
 * memory_wrapper.cpp ("malloc", "calloc", "realloc", "gpu_host_malloc",
 * "gpu_device_malloc", "free"); presumably the values index that table,
 * so keep the two in sync - confirm before reordering. */
typedef enum apex_allocator {
    APEX_MALLOC            = 0,  /* malloc */
    APEX_CALLOC            = 1,  /* calloc */
    APEX_REALLOC           = 2,  /* realloc */
    APEX_GPU_HOST_MALLOC   = 3,  /* gpu_host_malloc */
    APEX_GPU_DEVICE_MALLOC = 4,  /* gpu_device_malloc */
    APEX_FREE              = 5   /* free */
} apex_allocator_t;

namespace apex {

void apex_report_leaks();

typedef enum allocator {
MALLOC = 0,
CALLOC,
REALLOC,
GPU_HOST_MALLOC,
GPU_DEVICE_MALLOC
} allocator_t;
void apex_get_leak_symbols();

class record_t {
public:
size_t bytes;
task_identifier * id;
size_t tid;
allocator_t alloc;
record_t() : bytes(0), id(nullptr), tid(0), alloc(MALLOC), cpu(true) {}
record_t(size_t b, size_t t, allocator_t a, bool on_cpu) :
bytes(b), id(nullptr), tid(t), alloc(a), cpu(on_cpu) {}
apex_allocator_t alloc;
record_t() : bytes(0), id(nullptr), tid(0), alloc(APEX_MALLOC), resolved(false), cpu(true) {}
record_t(size_t b, size_t t, apex_allocator_t a, bool on_cpu) :
bytes(b), id(nullptr), tid(t), alloc(a), resolved(false), cpu(on_cpu) {}
//std::vector<uintptr_t> backtrace;
std::array<void*,32> backtrace;
std::array<void*,64> backtrace;
std::array<std::string,64> symbols;
bool resolved;
size_t size;
bool cpu;
};
Expand All @@ -45,25 +49,19 @@ class book_t {
public:
size_t saved_node_id;
std::atomic<size_t> totalAllocated{0};
std::unordered_map<void*,record_t> memoryMap;
std::unordered_map<const void*,record_t> memoryMap;
std::mutex mapMutex;
~book_t() {
apex_report_leaks();
}
};

class backtrace_record_t {
public:
size_t skip;
std::vector<uintptr_t>& _stack;
backtrace_record_t(size_t s, std::vector<uintptr_t>& _s) : skip(s), _stack(_s) {}
};

book_t& getBook(void);
void controlMemoryWrapper(bool enabled);
void printBacktrace(void);
void recordAlloc(size_t bytes, void* ptr, allocator_t alloc, bool cpu = true);
void recordFree(void* ptr, bool cpu = true);
void recordAlloc(const size_t bytes, const void* ptr,
const apex_allocator_t alloc, const bool cpu = true);
void recordFree(const void* ptr, const bool cpu = true);
void recordMetric(std::string name, double value);

}; // apex namespace
Expand Down
Loading

0 comments on commit c985b21

Please sign in to comment.