Skip to content

Commit

Permalink
Fix incorrect memory usage information on nvidia drivers 510.39+
Browse files Browse the repository at this point in the history
nvmlDeviceGetMemoryInfo_v2 was added in driver 510.39.01, which broke
the v1 API with no backward compatibility. A corresponding version of
pynvml (11.510.69+) is needed to use the v2 API in order to get the
correct memory usage information on NVIDIA drivers 510.39 or higher.

Fixes #141.
  • Loading branch information
wookayin committed Dec 1, 2022
1 parent 56a9dcf commit efa355a
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 12 deletions.
6 changes: 4 additions & 2 deletions gpustat/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,9 @@ def get_process_info(nv_process):
fan_speed = None # Not supported

try:
memory = N.nvmlDeviceGetMemoryInfo(handle) # in Bytes
# memory: in Bytes
# Note that this is a compat-patched API (see gpustat.nvml)
memory = N.nvmlDeviceGetMemoryInfo(handle)
except N.NVMLError as e:
log.add_exception("memory", e)
memory = None # Not supported
Expand All @@ -479,7 +481,7 @@ def get_process_info(nv_process):
try:
utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
except N.NVMLError as e:
log.add_exception("utilization_dnc", e)
log.add_exception("utilization_dec", e)
utilization_dec = None # Not supported

try:
Expand Down
46 changes: 44 additions & 2 deletions gpustat/nvml.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Imports pynvml with sanity checks and custom patches."""

import warnings
import functools
import os
import sys
Expand Down Expand Up @@ -54,6 +55,7 @@
# See #107, #141, and test_gpustat.py for more details.

_original_nvmlGetFunctionPointer = pynvml._nvmlGetFunctionPointer
_original_nvmlDeviceGetMemoryInfo = pynvml.nvmlDeviceGetMemoryInfo


class pynvml_monkeypatch:
Expand Down Expand Up @@ -101,9 +103,49 @@ def _nvmlGetFunctionPointer(name):

return ret

@staticmethod  # Note: must be defined as a staticmethod to allow mocking.
def original_nvmlDeviceGetMemoryInfo(*args, **kwargs):
    """Forward the call, unchanged, to the unpatched pynvml function.

    All positional and keyword arguments are passed straight through to
    the reference saved in ``_original_nvmlDeviceGetMemoryInfo``, and its
    return value (or exception) is propagated as-is.
    """
    unpatched = _original_nvmlDeviceGetMemoryInfo
    return unpatched(*args, **kwargs)

has_memoryinfo_v2 = None

setattr(pynvml, '_nvmlGetFunctionPointer',
pynvml_monkeypatch._nvmlGetFunctionPointer)
@staticmethod
@functools.wraps(pynvml.nvmlDeviceGetMemoryInfo)
def nvmlDeviceGetMemoryInfo(handle):
    """A patched version of nvmlDeviceGetMemoryInfo.

    This tries `version=N.nvmlMemory_v2` if the nvmlDeviceGetMemoryInfo_v2
    function is available (for driver >= 510.39), or falls back to the
    legacy v1 API (for driver < 510.39) to yield a correct result.
    See #141.

    Args:
        handle: The device handle to query; passed through unchanged to
            the original pynvml function.

    Returns:
        Whatever the original ``pynvml.nvmlDeviceGetMemoryInfo`` returns
        for the API version that was selected.

    Raises:
        pynvml.NVMLError: Any error from the underlying call other than
            the FunctionNotFound raised by the v2 attempt, which is
            handled here by retrying with the v1 API.
    """
    # Probe the driver for the v2 entry point only once; the result is
    # cached on the class. `None` means "not probed yet".
    if pynvml_monkeypatch.has_memoryinfo_v2 is None:
        try:
            pynvml._nvmlGetFunctionPointer("nvmlDeviceGetMemoryInfo_v2")
            pynvml_monkeypatch.has_memoryinfo_v2 = True
        except pynvml.NVMLError_FunctionNotFound:  # type: ignore
            pynvml_monkeypatch.has_memoryinfo_v2 = False

    if hasattr(pynvml, 'nvmlMemory_v2'):  # pynvml >= 11.510.69
        try:
            memory = pynvml_monkeypatch.original_nvmlDeviceGetMemoryInfo(
                handle, version=pynvml.nvmlMemory_v2)
        except pynvml.NVMLError_FunctionNotFound:  # type: ignore
            # pynvml >= 11.510 but driver is old (< 510.39)
            memory = pynvml_monkeypatch.original_nvmlDeviceGetMemoryInfo(handle)
    else:
        # pynvml is too old to speak the v2 API. If the driver does have
        # it, the v1 numbers are known to be wrong, so tell the user.
        if pynvml_monkeypatch.has_memoryinfo_v2:
            warnings.warn(
                "Your NVIDIA driver requires a compatible version of "
                "pynvml (>= 11.510.69) installed to display the correct "
                "memory usage information (See #141 for more details). "
                "Please try `pip install --upgrade pynvml`.")
        memory = pynvml_monkeypatch.original_nvmlDeviceGetMemoryInfo(handle)

    return memory


# Install the patched implementations onto the pynvml module so that every
# caller going through `pynvml.<name>` picks up the compat behavior.
pynvml._nvmlGetFunctionPointer = pynvml_monkeypatch._nvmlGetFunctionPointer
pynvml.nvmlDeviceGetMemoryInfo = pynvml_monkeypatch.nvmlDeviceGetMemoryInfo


__all__ = ['pynvml']
86 changes: 78 additions & 8 deletions gpustat/test_gpustat.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

import psutil
import pytest
from mockito import mock, unstub, when, when2
from mockito import mock, unstub, when, when2, ANY

import gpustat
from gpustat.nvml import pynvml, pynvml_monkeypatch
Expand All @@ -29,6 +29,8 @@ def remove_ansi_codes(s):

# -----------------------------------------------------------------------------

# Fake device handles, one per simulated GPU. Each carries a printable
# `value` and the GPU `index` it stands for.
mock_gpu_handles = [
    types.SimpleNamespace(
        value='mock-handle-{:d}'.format(gpu_index),
        index=gpu_index,
    )
    for gpu_index in range(3)
]

def _configure_mock(N=pynvml,
_scenario_nonexistent_pid=False, # GH-95
Expand All @@ -49,8 +51,6 @@ def _configure_mock(N=pynvml,
when(N)._nvmlGetFunctionPointer(...).thenCallOriginalImplementation()

NUM_GPUS = 3
mock_handles = [types.SimpleNamespace(value='mock-handle-%d' % i, index=i)
for i in range(3)]
when(N).nvmlDeviceGetCount().thenReturn(NUM_GPUS)

def _return_or_raise(v):
Expand All @@ -63,7 +63,7 @@ def _callable(*args, **kwargs):
return _callable

for i in range(NUM_GPUS):
handle = mock_handles[i]
handle = mock_gpu_handles[i]
if _scenario_failing_one_gpu and i == 2: # see #81, #125
assert (_scenario_failing_one_gpu is N.NVMLError_Unknown or
_scenario_failing_one_gpu is N.NVMLError_GpuIsLost)
Expand Down Expand Up @@ -99,13 +99,18 @@ def _callable(*args, **kwargs):
0: 250000, 1: 250000, 2: N.NVMLError_NotSupported()
}[i]))

mock_memory_t = namedtuple("Memory_t", ['total', 'used'])
# see also: NvidiaDriverMock
mock_memory_t = namedtuple("Memory_t", ['total', 'used']) # c_nvmlMemory_t
when(N).nvmlDeviceGetMemoryInfo(handle)\
.thenAnswer(_return_or_raise({
0: mock_memory_t(total=12883853312, used=8000*MB),
1: mock_memory_t(total=12781551616, used=9000*MB),
2: mock_memory_t(total=12781551616, used=0),
}[i]))
# this mock function assumes <510.39 behavior (#141)
when(N, strict=False)\
.nvmlDeviceGetMemoryInfo(handle, version=ANY())\
.thenRaise(N.NVMLError_FunctionNotFound)

mock_utilization_t = namedtuple("Utilization_t", ['gpu', 'memory'])
when(N).nvmlDeviceGetUtilizationRates(handle)\
Expand Down Expand Up @@ -273,6 +278,7 @@ class NvidiaDriverMock:
Relevant github issues:
#107: nvmlDeviceGetComputeRunningProcesses_v2 added
#141: nvmlDeviceGetMemoryInfo (v1) broken for 510.39.01+
"""
INSTANCES = []

Expand All @@ -281,6 +287,10 @@ def __init__(self, name, **kwargs):
self.feat = kwargs

def __call__(self, N):
    """Install this driver's stubs on ``N``.

    ``N`` is forwarded unchanged to the two installers below
    (presumably the pynvml module -- confirm against callers).
    Process-listing stubs are registered first, then memory-info stubs.
    """
    self.mock_processes(N)
    self.mock_memoryinfo(N)

def mock_processes(self, N):
when(N).nvmlDeviceGetComputeRunningProcesses(...).thenCallOriginalImplementation()
when(N).nvmlDeviceGetGraphicsRunningProcesses(...).thenCallOriginalImplementation()
when(N).nvmlSystemGetDriverVersion().thenReturn(self.name)
Expand Down Expand Up @@ -341,6 +351,57 @@ def _nvmlDeviceGetGraphicsRunningProcesses_v2(handle, c_count, c_procs):
else:
stub.thenRaise(pynvml.NVMLError(pynvml.NVML_ERROR_FUNCTION_NOT_FOUND))

def mock_memoryinfo(self, N):
    """Stub the memory-info API according to ``nvmlDeviceGetMemoryInfo_v``.

    For version 2, ``pynvml_monkeypatch.original_nvmlDeviceGetMemoryInfo``
    is stubbed per mock handle: calling it with ``version=nvmlMemory_v2``
    returns the reference figures, and calling it without ``version``
    returns inflated ``used`` figures (the #141 symptom).

    For version 1, ``N.nvmlDeviceGetMemoryInfo`` is stubbed per mock
    handle: any ``version=`` call raises ``N.NVMLError_FunctionNotFound``
    and the plain call returns the reference figures.

    Raises:
        NotImplementedError: if the configured version is neither 1 nor 2.
    """
    # Local stand-in for the value compared against `version=` below.
    nvmlMemory_v2 = 0x02000028
    if self.nvmlDeviceGetMemoryInfo_v == 1:
        # v1 result type: only `total` and `used` fields.
        mock_memory_t = namedtuple(
            "c_nvmlMemory_t",
            ['total', 'used'],
        )
    elif self.nvmlDeviceGetMemoryInfo_v == 2:
        # v2 result type: adds `version`, `reserved` and `free` fields.
        mock_memory_t = namedtuple(
            "c_nvmlMemory_v2_t",
            ['version', 'total', 'reserved', 'free', 'used'],
        )
        # Defaults let the stubs below pass only `total` and `used`.
        mock_memory_t.__new__.__defaults__ = (nvmlMemory_v2, 0, 0, 0, 0)
    else:
        raise NotImplementedError

    # simulates drivers >= 510.39, where memoryinfo v2 is introduced
    if self.nvmlDeviceGetMemoryInfo_v == 2:
        for handle in mock_gpu_handles:
            # a correct API requires version=... parameter
            # this assumes nvidia driver is also recent enough.
            when(pynvml_monkeypatch, strict=False)\
                .original_nvmlDeviceGetMemoryInfo(handle, version=nvmlMemory_v2)\
                .thenReturn({
                    0: mock_memory_t(total=12883853312, used=8000*MB),
                    1: mock_memory_t(total=12781551616, used=9000*MB),
                    2: mock_memory_t(total=12781551616, used=0),
                }[handle.index])
            # simulate #141: without the v2 parameter, gives wrong result
            # (`used` is 99 MB higher than the reference figures above)
            when(pynvml_monkeypatch)\
                .original_nvmlDeviceGetMemoryInfo(handle)\
                .thenReturn({
                    0: mock_memory_t(total=12883853312, used=8099*MB),
                    1: mock_memory_t(total=12781551616, used=9099*MB),
                    2: mock_memory_t(total=12781551616, used=99*MB),
                }[handle.index])

    else:  # old drivers < 510.39
        for handle in mock_gpu_handles:
            # when pynvml>=11.510, v2 API can be called but can't be used
            when(N, strict=False)\
                .nvmlDeviceGetMemoryInfo(handle, version=ANY())\
                .thenRaise(N.NVMLError_FunctionNotFound)
            # On these old drivers the plain (v1) call gives the correct result
            when(N).nvmlDeviceGetMemoryInfo(handle)\
                .thenReturn({
                    0: mock_memory_t(total=12883853312, used=8000*MB),
                    1: mock_memory_t(total=12781551616, used=9000*MB),
                    2: mock_memory_t(total=12781551616, used=0),
                }[handle.index])

def __getattr__(self, k):
    """Resolve attribute ``k`` by indexing ``self.feat`` with it.

    ``self.feat`` holds the keyword arguments given to ``__init__``.
    Python calls ``__getattr__`` only after normal attribute lookup has
    failed. NOTE(review): a missing key surfaces as the lookup error from
    ``self.feat`` (KeyError for a dict), not AttributeError.
    """
    return self.feat[k]

Expand All @@ -353,9 +414,18 @@ def __repr__(self):


NvidiaDriverMock.INSTANCES = [
NvidiaDriverMock('430.xx.xx', nvmlDeviceGetComputeRunningProcesses_v=1),
NvidiaDriverMock('450.66', nvmlDeviceGetComputeRunningProcesses_v=2),
NvidiaDriverMock('510.39.01', nvmlDeviceGetComputeRunningProcesses_v=3),
NvidiaDriverMock('430.xx.xx',
nvmlDeviceGetComputeRunningProcesses_v=1,
nvmlDeviceGetMemoryInfo_v=1,
),
NvidiaDriverMock('450.66',
nvmlDeviceGetComputeRunningProcesses_v=2,
nvmlDeviceGetMemoryInfo_v=1,
),
NvidiaDriverMock('510.39.01',
nvmlDeviceGetComputeRunningProcesses_v=3,
nvmlDeviceGetMemoryInfo_v=2,
),
]


Expand Down

0 comments on commit efa355a

Please sign in to comment.