Fix incorrect memory usage information on nvidia drivers 510.39+

nvmlDeviceGetMemoryInfo_v2 was added in driver 510.39.01, but it broke the v1 API with no backward compatibility. A corresponding version of pynvml (11.510.69+) is needed to use the v2 API, in order to get the correct memory usage information on nvidia drivers 510.39 or higher. Fixes #141.
wookayin · Dec 1, 2022 · efa355a · efa355a
1 parent 56a9dcf
commit efa355a
Show file tree

Hide file tree

Showing 3 changed files with 126 additions and 12 deletions.
diff --git a/gpustat/core.py b/gpustat/core.py
@@ -459,7 +459,9 @@ def get_process_info(nv_process):
                 fan_speed = None  # Not supported
 
             try:
-                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
+                # memory: in Bytes
+                # Note that this is a compat-patched API (see gpustat.nvml)
+                memory = N.nvmlDeviceGetMemoryInfo(handle)
             except N.NVMLError as e:
                 log.add_exception("memory", e)
                 memory = None  # Not supported
@@ -479,7 +481,7 @@ def get_process_info(nv_process):
             try:
                 utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
             except N.NVMLError as e:
-                log.add_exception("utilization_dnc", e)
+                log.add_exception("utilization_dec", e)
                 utilization_dec = None  # Not supported
 
             try:

diff --git a/gpustat/nvml.py b/gpustat/nvml.py
@@ -1,5 +1,6 @@
 """Imports pynvml with sanity checks and custom patches."""
 
+import warnings
 import functools
 import os
 import sys
@@ -54,6 +55,7 @@
 # See #107,  #141, and test_gpustat.py for more details.
 
 _original_nvmlGetFunctionPointer = pynvml._nvmlGetFunctionPointer
+_original_nvmlDeviceGetMemoryInfo = pynvml.nvmlDeviceGetMemoryInfo
 
 
 class pynvml_monkeypatch:
@@ -101,9 +103,49 @@ def _nvmlGetFunctionPointer(name):
 
         return ret
 
+    @staticmethod  # Note: must be defined as a staticmethod to allow mocking.
+    def original_nvmlDeviceGetMemoryInfo(*args, **kwargs):  # unpatched pynvml call
+        return _original_nvmlDeviceGetMemoryInfo(*args, **kwargs)
+
+    has_memoryinfo_v2 = None  # cached driver probe result; None = not probed yet
 
-setattr(pynvml, '_nvmlGetFunctionPointer',
-        pynvml_monkeypatch._nvmlGetFunctionPointer)
+    @staticmethod
+    @functools.wraps(pynvml.nvmlDeviceGetMemoryInfo)
+    def nvmlDeviceGetMemoryInfo(handle):
+        """A patched version of nvmlDeviceGetMemoryInfo.
+
+        This tries `version=N.nvmlMemory_v2` if the nvmlDeviceGetMemoryInfo_v2
+        function is available (driver >= 510.39), or falls back to the legacy
+        v1 API (driver < 510.39) to yield a correct result. See #141.
+        """
+        if pynvml_monkeypatch.has_memoryinfo_v2 is None:  # probe the driver only once
+            try:
+                pynvml._nvmlGetFunctionPointer("nvmlDeviceGetMemoryInfo_v2")
+                pynvml_monkeypatch.has_memoryinfo_v2 = True
+            except pynvml.NVMLError_FunctionNotFound:  # type: ignore
+                pynvml_monkeypatch.has_memoryinfo_v2 = False
+
+        if hasattr(pynvml, 'nvmlMemory_v2'):  # pynvml >= 11.510.69
+            try:
+                memory = pynvml_monkeypatch.original_nvmlDeviceGetMemoryInfo(
+                    handle, version=pynvml.nvmlMemory_v2)
+            except pynvml.NVMLError_FunctionNotFound:  # type: ignore
+                # pynvml >= 11.510 but driver is old (<510.39)
+                memory = pynvml_monkeypatch.original_nvmlDeviceGetMemoryInfo(handle)
+        else:
+            if pynvml_monkeypatch.has_memoryinfo_v2:  # new driver, but old pynvml
+                warnings.warn(
+                    "Your NVIDIA driver requires a compatible version of "
+                    "pynvml (>= 11.510.69) installed to display the correct "
+                    "memory usage information (See #141 for more details). "
+                    "Please try `pip install --upgrade pynvml`.")
+            memory = pynvml_monkeypatch.original_nvmlDeviceGetMemoryInfo(handle)
+
+        return memory
+
+
+setattr(pynvml, '_nvmlGetFunctionPointer', pynvml_monkeypatch._nvmlGetFunctionPointer)
+setattr(pynvml, 'nvmlDeviceGetMemoryInfo', pynvml_monkeypatch.nvmlDeviceGetMemoryInfo)
 
 
 __all__ = ['pynvml']
diff --git a/gpustat/test_gpustat.py b/gpustat/test_gpustat.py
@@ -13,7 +13,7 @@
 
 import psutil
 import pytest
-from mockito import mock, unstub, when, when2
+from mockito import mock, unstub, when, when2, ANY
 
 import gpustat
 from gpustat.nvml import pynvml, pynvml_monkeypatch
@@ -29,6 +29,8 @@ def remove_ansi_codes(s):
 
 # -----------------------------------------------------------------------------
 
+mock_gpu_handles = [types.SimpleNamespace(value='mock-handle-%d' % i, index=i)
+                    for i in range(3)]
 
 def _configure_mock(N=pynvml,
                     _scenario_nonexistent_pid=False,  # GH-95
@@ -49,8 +51,6 @@ def _configure_mock(N=pynvml,
     when(N)._nvmlGetFunctionPointer(...).thenCallOriginalImplementation()
 
     NUM_GPUS = 3
-    mock_handles = [types.SimpleNamespace(value='mock-handle-%d' % i, index=i)
-                    for i in range(3)]
     when(N).nvmlDeviceGetCount().thenReturn(NUM_GPUS)
 
     def _return_or_raise(v):
@@ -63,7 +63,7 @@ def _callable(*args, **kwargs):
         return _callable
 
     for i in range(NUM_GPUS):
-        handle = mock_handles[i]
+        handle = mock_gpu_handles[i]
         if _scenario_failing_one_gpu and i == 2:  # see #81, #125
             assert (_scenario_failing_one_gpu is N.NVMLError_Unknown or
                     _scenario_failing_one_gpu is N.NVMLError_GpuIsLost)
@@ -99,13 +99,18 @@ def _callable(*args, **kwargs):
                 0: 250000, 1: 250000, 2: N.NVMLError_NotSupported()
             }[i]))
 
-        mock_memory_t = namedtuple("Memory_t", ['total', 'used'])
+        # see also: NvidiaDriverMock
+        mock_memory_t = namedtuple("Memory_t", ['total', 'used'])  # c_nvmlMemory_t
         when(N).nvmlDeviceGetMemoryInfo(handle)\
             .thenAnswer(_return_or_raise({
                 0: mock_memory_t(total=12883853312, used=8000*MB),
                 1: mock_memory_t(total=12781551616, used=9000*MB),
                 2: mock_memory_t(total=12781551616, used=0),
             }[i]))
+        # this mock function assumes <510.39 behavior (#141)
+        when(N, strict=False)\
+            .nvmlDeviceGetMemoryInfo(handle, version=ANY())\
+            .thenRaise(N.NVMLError_FunctionNotFound)
 
         mock_utilization_t = namedtuple("Utilization_t", ['gpu', 'memory'])
         when(N).nvmlDeviceGetUtilizationRates(handle)\
@@ -273,6 +278,7 @@ class NvidiaDriverMock:
 
     Relevant github issues:
         #107: nvmlDeviceGetComputeRunningProcesses_v2 added
+        #141: nvmlDeviceGetMemoryInfo (v1) broken for 510.39.01+
     """
     INSTANCES = []
 
@@ -281,6 +287,10 @@ def __init__(self, name, **kwargs):
         self.feat = kwargs
 
     def __call__(self, N):
+        self.mock_processes(N)   # running-process API stubs
+        self.mock_memoryinfo(N)  # memory-info API stubs
+
+    def mock_processes(self, N):
         when(N).nvmlDeviceGetComputeRunningProcesses(...).thenCallOriginalImplementation()
         when(N).nvmlDeviceGetGraphicsRunningProcesses(...).thenCallOriginalImplementation()
         when(N).nvmlSystemGetDriverVersion().thenReturn(self.name)
@@ -341,6 +351,57 @@ def _nvmlDeviceGetGraphicsRunningProcesses_v2(handle, c_count, c_procs):
             else:
                 stub.thenRaise(pynvml.NVMLError(pynvml.NVML_ERROR_FUNCTION_NOT_FOUND))
 
+    def mock_memoryinfo(self, N):  # stubs memory queries for v1/v2 drivers
+        nvmlMemory_v2 = 0x02000028  # passed as version=... to select the v2 struct
+        if self.nvmlDeviceGetMemoryInfo_v == 1:
+            mock_memory_t = namedtuple(
+                "c_nvmlMemory_t",
+                ['total', 'used'],
+            )
+        elif self.nvmlDeviceGetMemoryInfo_v == 2:
+            mock_memory_t = namedtuple(
+                "c_nvmlMemory_v2_t",
+                ['version', 'total', 'reserved', 'free', 'used'],
+            )
+            mock_memory_t.__new__.__defaults__ = (nvmlMemory_v2, 0, 0, 0, 0)
+        else:
+            raise NotImplementedError  # only v1 and v2 are modelled
+
+        # simulates drivers >= 510.39, where memoryinfo v2 is introduced
+        if self.nvmlDeviceGetMemoryInfo_v == 2:
+            for handle in mock_gpu_handles:
+                # the correct call passes the version=... parameter;
+                # this assumes the nvidia driver is also recent enough.
+                when(pynvml_monkeypatch, strict=False)\
+                    .original_nvmlDeviceGetMemoryInfo(handle, version=nvmlMemory_v2)\
+                    .thenReturn({
+                        0: mock_memory_t(total=12883853312, used=8000*MB),
+                        1: mock_memory_t(total=12781551616, used=9000*MB),
+                        2: mock_memory_t(total=12781551616, used=0),
+                    }[handle.index])
+                # simulate #141: without version=..., the reported usage is wrong
+                when(pynvml_monkeypatch)\
+                    .original_nvmlDeviceGetMemoryInfo(handle)\
+                    .thenReturn({
+                        0: mock_memory_t(total=12883853312, used=8099*MB),
+                        1: mock_memory_t(total=12781551616, used=9099*MB),
+                        2: mock_memory_t(total=12781551616, used=99*MB),
+                    }[handle.index])
+
+        else:  # old drivers < 510.39
+            for handle in mock_gpu_handles:
+                # with pynvml>=11.510 the v2 call can be made, but it raises here
+                when(N, strict=False)\
+                    .nvmlDeviceGetMemoryInfo(handle, version=ANY())\
+                    .thenRaise(N.NVMLError_FunctionNotFound)
+                # The v1 API gives the correct result on these old drivers
+                when(N).nvmlDeviceGetMemoryInfo(handle)\
+                    .thenReturn({
+                        0: mock_memory_t(total=12883853312, used=8000*MB),
+                        1: mock_memory_t(total=12781551616, used=9000*MB),
+                        2: mock_memory_t(total=12781551616, used=0),
+                    }[handle.index])
+
     def __getattr__(self, k):
         return self.feat[k]
 
@@ -353,9 +414,18 @@ def __repr__(self):
 
 
 NvidiaDriverMock.INSTANCES = [
-    NvidiaDriverMock('430.xx.xx', nvmlDeviceGetComputeRunningProcesses_v=1),
-    NvidiaDriverMock('450.66', nvmlDeviceGetComputeRunningProcesses_v=2),
-    NvidiaDriverMock('510.39.01', nvmlDeviceGetComputeRunningProcesses_v=3),
+    NvidiaDriverMock('430.xx.xx',
+                     nvmlDeviceGetComputeRunningProcesses_v=1,
+                     nvmlDeviceGetMemoryInfo_v=1,
+                     ),
+    NvidiaDriverMock('450.66',
+                     nvmlDeviceGetComputeRunningProcesses_v=2,
+                     nvmlDeviceGetMemoryInfo_v=1,
+                     ),
+    NvidiaDriverMock('510.39.01',
+                     nvmlDeviceGetComputeRunningProcesses_v=3,
+                     nvmlDeviceGetMemoryInfo_v=2,
+                     ),
 ]