Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[BUGFIX] Improve compile/use of nvmlDeviceGetComputeRunningProcesses() #20887

Merged
merged 2 commits into from
Feb 11, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 16 additions & 8 deletions src/profiler/storage_profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <regex>
#include <unordered_map>
#include <vector>
#include <type_traits>
#include "./profiler.h"
#include "../common/utils.h"
#include "../common/cuda/utils.h"
Expand All @@ -45,6 +46,19 @@ GpuDeviceStorageProfiler* GpuDeviceStorageProfiler::Get() {
return gpu_dev_storage_profiler.get();
}

#if MXNET_USE_NVML
// Type trait: given a pointer to a function taking exactly three parameters,
// expose the type of the third parameter as `arg3_t`.  It is used to deduce
// the possibly versioned variant of nvmlProcessInfo_t* expected as the 3rd
// arg of nvmlDeviceGetComputeRunningProcesses().
template <typename FuncPtr>
struct GetArgType;  // primary template is declared only; see the specialization

// Specialization matching pointers to three-argument functions.
template <typename Ret, typename Arg1, typename Arg2, typename Arg3>
struct GetArgType<Ret (*)(Arg1, Arg2, Arg3)> {
  using arg3_t = Arg3;
};
// Pointer type taken by nvmlDeviceGetComputeRunningProcesses() as its 3rd arg.
using NvmlProcessInfoPtr = GetArgType<decltype(&nvmlDeviceGetComputeRunningProcesses)>::arg3_t;
// The pointed-to type, i.e. NvmlProcessInfoPtr with the pointer removed.
using NvmlProcessInfo = std::remove_pointer<NvmlProcessInfoPtr>::type;
#endif

void GpuDeviceStorageProfiler::DumpProfile() const {
size_t current_pid = common::current_process_id();
std::ofstream fout((filename_prefix_ + "-pid_" + std::to_string(current_pid) + ".csv").c_str());
Expand Down Expand Up @@ -97,23 +111,17 @@ void GpuDeviceStorageProfiler::DumpProfile() const {
// If NVML has been enabled, add amend term to the GPU memory profile.
nvmlDevice_t nvml_device;

#if NVML_API_VERSION < 11
typedef std::vector<nvmlProcessInfo_t> ProcessInfoVector;
#else
typedef std::vector<nvmlProcessInfo_v1_t> ProcessInfoVector;
#endif

NVML_CALL(nvmlInit());
for (std::pair<const int, size_t>& dev_id_total_alloc_pair : gpu_dev_id_total_alloc_map) {
unsigned info_count = 0;
ProcessInfoVector infos(info_count);
std::vector<NvmlProcessInfo> infos(info_count);

NVML_CALL(nvmlDeviceGetHandleByIndex(dev_id_total_alloc_pair.first, &nvml_device));
// The first call to `nvmlDeviceGetComputeRunningProcesses` is to set the
// size of info. Since `NVML_ERROR_INSUFFICIENT_SIZE` will always be
// returned, we do not wrap the function call with `NVML_CALL`.
nvmlDeviceGetComputeRunningProcesses(nvml_device, &info_count, infos.data());
infos = ProcessInfoVector(info_count);
infos.resize(info_count);
NVML_CALL(nvmlDeviceGetComputeRunningProcesses(nvml_device, &info_count, infos.data()));

bool amend_made = false;
Expand Down