Skip to content

Commit 3cd6673

Browse files
authored
[Runtime] Allow query of available device memory through DeviceAPI (#16994)
* [Runtime] Allow query of available device memory through DeviceAPI Prior to this commit, the total device memory could be queried through the `DeviceAPI` interface, but the currently available device memory could not. This functionality may be useful for debugging, or for validating available memory prior to model execution. This commit implements the property `Device.available_global_memory`, which queries the `DeviceAttrKind::kAvailableGlobalMemory`. Support for this query, like all device attribute queries, may vary across different backends, and will return `None` for backends that do not support this query. This commit currently implements support for `kAvailableGlobalMemory` only for TVM's CUDA backend. * Updated docstring to fix copy/paste typo * Lint fix, cover all enum values in case/switch * Fix ROCm compilation warning
1 parent afb6416 commit 3cd6673

File tree

7 files changed

+86
-22
lines changed

7 files changed

+86
-22
lines changed

include/tvm/runtime/device_api.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ enum DeviceAttrKind : int {
5151
kDriverVersion = 12,
5252
kL2CacheSizeBytes = 13,
5353
kTotalGlobalMemory = 14,
54+
kAvailableGlobalMemory = 15,
5455
};
5556

5657
#ifdef TVM_KALLOC_ALIGNMENT

python/tvm/_ffi/runtime_ctypes.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -539,11 +539,25 @@ def total_global_memory(self):
539539
Returns
540540
-------
541541
total_global_memory : int or None
542-
Return the global memory available on device in bytes.
542+
Return the total size of global memory on device in bytes.
543543
Return None if the device does not support this feature.
544544
"""
545545
return self._GetDeviceAttr(self.device_type, self.device_id, 14)
546546

547+
@property
548+
def available_global_memory(self):
549+
"""Return size of the available global memory.
550+
551+
Supported devices include CUDA.
552+
553+
Returns
554+
-------
555+
available_global_memory : int or None
556+
Return the amount of unallocated global memory on device in bytes.
557+
Return None if the device does not support this feature.
558+
"""
559+
return self._GetDeviceAttr(self.device_type, self.device_id, 15)
560+
547561
def texture_spatial_limit(self):
548562
"""Returns limits for textures by spatial dimensions
549563

src/runtime/cuda/cuda_device_api.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,12 @@ class CUDADeviceAPI final : public DeviceAPI {
121121
*rv = total_global_memory;
122122
return;
123123
}
124+
case kAvailableGlobalMemory: {
125+
size_t free_mem, total_mem;
126+
CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem));
127+
*rv = static_cast<int64_t>(free_mem);
128+
return;
129+
}
124130
}
125131
*rv = value;
126132
}

src/runtime/opencl/opencl_device_api.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,12 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
214214
*rv = static_cast<int64_t>(total_global_memory);
215215
return;
216216
}
217+
218+
case kAvailableGlobalMemory:
219+
// Not currently implemented. Based on
220+
// https://stackoverflow.com/a/3568223, may not be implementable
221+
// at all through OpenCL API.
222+
break;
217223
}
218224
}
219225

src/runtime/rocm/rocm_device_api.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,10 @@ class ROCMDeviceAPI final : public DeviceAPI {
136136
*rv = total_global_memory;
137137
return;
138138
}
139+
140+
case kAvailableGlobalMemory:
141+
// Not currently implemented.
142+
break;
139143
}
140144
*rv = value;
141145
}

src/runtime/vulkan/vulkan_device_api.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,11 @@ void VulkanDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
168168
*rv = device(index).compute_memory_size;
169169
return;
170170
}
171+
172+
case kAvailableGlobalMemory:
173+
// Not currently implemented. Will only be implementable for
174+
// devices that support the VK_EXT_memory_budget extension.
175+
break;
171176
}
172177
}
173178

tests/python/all-platform-minimal-test/test_runtime_ndarray.py

Lines changed: 49 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,33 +16,63 @@
1616
# under the License.
1717
"""Basic runtime enablement test."""
1818

19-
import tvm
20-
from tvm import te
19+
import math
20+
21+
import pytest
2122
import numpy as np
23+
24+
import tvm
2225
import tvm.testing
26+
from tvm import te
27+
28+
dtype = tvm.testing.parameter("uint8", "int8", "uint16", "int16", "uint32", "int32", "float32")
29+
30+
31+
def test_nd_create(target, dev, dtype):
32+
x = np.random.randint(0, 10, size=(3, 4))
33+
x = np.array(x, dtype=dtype)
34+
y = tvm.nd.array(x, device=dev)
35+
z = y.copyto(dev)
36+
assert y.dtype == x.dtype
37+
assert y.shape == x.shape
38+
assert isinstance(y, tvm.nd.NDArray)
39+
np.testing.assert_equal(x, y.numpy())
40+
np.testing.assert_equal(x, z.numpy())
41+
42+
# no need here, just to test usablity
43+
dev.sync()
44+
45+
46+
def test_memory_usage(target, dev, dtype):
47+
available_memory_before = dev.available_global_memory
48+
if available_memory_before is None:
49+
pytest.skip(reason=f"Target '{target}' does not support queries of available memory")
50+
51+
arr = tvm.nd.empty([1024, 1024], dtype=dtype, device=dev)
52+
available_memory_after = dev.available_global_memory
53+
54+
num_elements = math.prod(arr.shape)
55+
element_nbytes = tvm.runtime.DataType(dtype).itemsize()
56+
expected_memory_after = available_memory_before - num_elements * element_nbytes
57+
58+
# Allocations may be padded out to provide alignment, to match a
59+
# page boundary, due to additional device-side bookkeeping
60+
# required by the TVM backend or the driver, etc. Therefore, the
61+
# available memory may decrease by more than the requested amount.
62+
assert available_memory_after <= expected_memory_after
2363

64+
# TVM's NDArray type is a reference-counted handle to the
65+
# underlying reference. After the last reference to an NDArray is
66+
# cleared, the backing allocation will be freed.
67+
del arr
2468

25-
@tvm.testing.uses_gpu
26-
def test_nd_create():
27-
for target, dev in tvm.testing.enabled_targets():
28-
for dtype in ["uint8", "int8", "uint16", "int16", "uint32", "int32", "float32"]:
29-
x = np.random.randint(0, 10, size=(3, 4))
30-
x = np.array(x, dtype=dtype)
31-
y = tvm.nd.array(x, device=dev)
32-
z = y.copyto(dev)
33-
assert y.dtype == x.dtype
34-
assert y.shape == x.shape
35-
assert isinstance(y, tvm.nd.NDArray)
36-
np.testing.assert_equal(x, y.numpy())
37-
np.testing.assert_equal(x, z.numpy())
38-
# no need here, just to test usablity
39-
dev.sync()
69+
assert dev.available_global_memory == available_memory_before
4070

4171

4272
def test_fp16_conversion():
4373
n = 100
4474

45-
for (src, dst) in [("float32", "float16"), ("float16", "float32")]:
75+
for src, dst in [("float32", "float16"), ("float16", "float32")]:
4676
A = te.placeholder((n,), dtype=src)
4777
B = te.compute((n,), lambda i: A[i].astype(dst))
4878

@@ -66,6 +96,4 @@ def test_dtype():
6696

6797

6898
if __name__ == "__main__":
69-
test_nd_create()
70-
test_fp16_conversion()
71-
test_dtype()
99+
tvm.testing.main()

0 commit comments

Comments
 (0)