20 changes: 13 additions & 7 deletions cuda_bindings/cuda/bindings/cyruntime.pyx.in
@@ -1917,13 +1917,19 @@ cdef cudaError_t getLocalRuntimeVersion(int* runtimeVersion) except ?cudaErrorCa
cdef cudaError_t err = cudaSuccess
err = (<cudaError_t (*)(int*) except ?cudaErrorCallRequiresNewerDriver nogil> __cudaRuntimeGetVersion)(runtimeVersion)

# Unload
{{if 'Windows' == platform.system()}}
windll.FreeLibrary(handle)
{{else}}
dlfcn.dlclose(handle)
{{endif}}
# We explicitly do *NOT* cleanup the library handle here, acknowledging
# that, yes, the handle leaks. The reason is that there's a
# `functools.cache` on the top-level caller of this function.
#
# This means the library would be opened once and then immediately closed,
# while the (now stale) handle remained in the cache for later callers.
#
# Since we open the library one time (technically once per unique library name),
# there's not a ton of leakage, which we deem acceptable for the 1000x speedup
# achieved by caching (ultimately) `ctypes.CDLL` calls.
#
# Long(er)-term we can explore cleaning up the library using higher-level
# Python mechanisms, like `__del__` or `weakref.finalize`.
Collaborator:
Copying my comment from the private PR:

Could you help me understand the context more? Where is the functools.cache?

I'm thinking it wouldn't be difficult to change this function to do the caching of the result/error right here.

Contributor Author:
Where is the functools.cache?

https://github.com/nvidia/cuda-python/blob/main/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py?plain=1#L54

I'm thinking it wouldn't be difficult to change this function to do the caching of the result/error right here.

The caching isn't done for the function's result.

It's that this function loads the library and then closes it, invalidating the handle cached by the functools.cache that decorates load_nvidia_dynamic_lib. The pointer itself remains valid, but the symbol table (at least in the ELF loader) ends up containing NULL pointers that are eventually dereferenced during a subsequent dlsym call made with that (now invalid) handle.
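For concreteness, here is a minimal sketch of that failure mode (Linux/glibc only; the library name, symbol name, and cached_load helper are placeholders for illustration, not the actual pathfinder code):

import ctypes
import functools

_libdl = ctypes.CDLL("libdl.so.2", use_errno=True)  # glibc; on newer glibc these symbols also live in libc
_libdl.dlopen.restype = ctypes.c_void_p
_libdl.dlopen.argtypes = [ctypes.c_char_p, ctypes.c_int]
_libdl.dlsym.restype = ctypes.c_void_p
_libdl.dlsym.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
_libdl.dlclose.argtypes = [ctypes.c_void_p]
_libdl.dlclose.restype = ctypes.c_int

RTLD_NOW = 2  # glibc value

@functools.cache
def cached_load(name: bytes) -> int:
    # The handle is computed once per name and then served from the cache forever.
    return _libdl.dlopen(name, RTLD_NOW)

handle = cached_load(b"libcudart.so.12")
# ... the caller uses the library, then "cleans up" its local handle:
_libdl.dlclose(ctypes.c_void_p(handle))

# A later caller receives the *same* cached handle, which now refers to an
# unloaded library; dlsym through it is undefined behavior and can segfault.
stale = cached_load(b"libcudart.so.12")
_libdl.dlsym(ctypes.c_void_p(stale), b"cudaRuntimeGetVersion")  # may crash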

Collaborator:
Reposting my response here also:

Ah ... thanks. Could you please give me a moment to think about this?

I didn't realize that the caching implies: never close the handle. That's not good.

Contributor Author:
We can keep the discussion here to avoid duplicating on cuda-python-private.

Contributor Author:
I spent most of the day going down this rabbit hole, so I'm happy to talk it through IRL if that helps.

Collaborator:
Assuming you need to get this issue out of the way asap:

WDYT about:

Comment out the code here (but don't delete for easy reference).

Add this comment:

# Skip closing handle until https://github.com/NVIDIA/cuda-python/issues/1011 is resolved.

Contributor Author:
Comment out the code here (but don't delete for easy reference).

Not really a huge fan of that in general.

We have git history if someone really needs the exact code.

Contributor Author:
Assuming you need to get this issue out of the way asap:

There's no rush here since 3.13t is experimental and 3.14 is still an RC.

If you have a solution you want to explore, have at it!

Collaborator:
I think we will not have a solution overnight. For the moment I'd just do this:

# Currently pathfinder does not support closing the handle.
# See https://github.com/NVIDIA/cuda-python/issues/1011 for background.

It's fine to delete the code for closing the handle entirely. From what I learned yesterday afternoon, the code here will have to change for sure, if we decide to support closing the handles.

Contributor Author:
I'm not following what problem remains with the weakref finalize solution. I'm not saying it is without problems, but I currently don't see how it doesn't solve the problem of correctly closing a CDLL-opened library at the right time.
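To make that concrete, here's a minimal sketch of what I mean by the weakref.finalize approach (hypothetical names, Linux-only; a sketch of the idea, not the code in this PR): open the library via ctypes and register a finalizer that calls dlclose on the raw handle once the wrapper object is collected.

import ctypes
import weakref

_libdl = ctypes.CDLL("libdl.so.2")  # glibc; on newer glibc dlclose also lives in libc
_libdl.dlclose.argtypes = [ctypes.c_void_p]
_libdl.dlclose.restype = ctypes.c_int


class LoadedLib:
    def __init__(self, soname: str):
        self._cdll = ctypes.CDLL(soname)
        # The finalizer captures only the raw address (not `self`), so it
        # cannot keep the wrapper alive; it runs when `self` is collected
        # or, at the latest, during interpreter shutdown.
        weakref.finalize(self, _libdl.dlclose, ctypes.c_void_p(self._cdll._handle))


lib = LoadedLib("libcudart.so.12")
del lib  # collection of `lib` triggers dlclose on its handle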


# Return
return err
{{endif}}
16 changes: 9 additions & 7 deletions cuda_bindings/tests/test_cudart.py
@@ -1404,10 +1404,12 @@ def test_struct_pointer_comparison(target):


def test_getLocalRuntimeVersion():
try:
err, version = cudart.getLocalRuntimeVersion()
except pathfinder.DynamicLibNotFoundError:
pytest.skip("cudart dynamic lib not available")
else:
assertSuccess(err)
assert version >= 12000 # CUDA 12.0
# verify that successive calls do not segfault the interpreter
for _ in range(10):
try:
err, version = cudart.getLocalRuntimeVersion()
except pathfinder.DynamicLibNotFoundError:
pytest.skip("cudart dynamic lib not available")
else:
assertSuccess(err)
assert version >= 12000 # CUDA 12.0
cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py
@@ -38,11 +38,20 @@ def _load_libdl() -> ctypes.CDLL:
LIBDL.dlerror.argtypes = []
LIBDL.dlerror.restype = ctypes.c_char_p

LIBDL.dlclose.argtypes = [ctypes.c_void_p]
LIBDL.dlclose.restype = ctypes.c_int

# First appeared in 2004-era glibc. Universally correct on Linux for all practical purposes.
RTLD_DI_LINKMAP = 2
RTLD_DI_ORIGIN = 6


def unload_dl(handle: ctypes.c_void_p) -> None:
result = LIBDL.dlclose(handle)
if result:
raise RuntimeError(LIBDL.dlerror())


class _LinkMapLNameView(ctypes.Structure):
"""
Prefix-only view of glibc's `struct link_map` used **solely** to read `l_name`.
(Windows counterpart of load_dl_linux.py)
@@ -46,6 +46,9 @@
kernel32.AddDllDirectory.argtypes = [ctypes.wintypes.LPCWSTR]
kernel32.AddDllDirectory.restype = ctypes.c_void_p # DLL_DIRECTORY_COOKIE

kernel32.FreeLibrary.argtypes = [ctypes.wintypes.HMODULE]
kernel32.FreeLibrary.restype = ctypes.c_bool


def ctypes_handle_to_unsigned_int(handle: ctypes.wintypes.HMODULE) -> int:
"""Convert ctypes HMODULE to unsigned int."""
@@ -157,3 +160,9 @@ def load_with_abs_path(libname: str, found_path: str) -> LoadedDL:
raise RuntimeError(f"Failed to load DLL at {found_path}: Windows error {error_code}")

return LoadedDL(found_path, False, ctypes_handle_to_unsigned_int(handle))


def unload_dl(handle: ctypes.c_void_p) -> None:
result = kernel32.FreeLibrary(handle)
if not result:
raise RuntimeError(f"Failed to load windows DLL with error code: {ctypes.GetLastError()}")
cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py
@@ -1,9 +1,11 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import ctypes
import functools
import struct
import sys
import weakref

from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import _FindNvidiaDynamicLib
from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL, load_dependencies
@@ -14,12 +16,14 @@
check_if_already_loaded_from_elsewhere,
load_with_abs_path,
load_with_system_search,
unload_dl,
)
else:
from cuda.pathfinder._dynamic_libs.load_dl_linux import (
check_if_already_loaded_from_elsewhere,
load_with_abs_path,
load_with_system_search,
unload_dl,
)


@@ -117,4 +121,13 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL:
f" Currently running: {pointer_size_bits}-bit Python"
f" {sys.version_info.major}.{sys.version_info.minor}"
)
return _load_lib_no_cache(libname)

library = _load_lib_no_cache(libname)

# Ensure that the library is unloaded after GC runs on `library`
#
# We only need the address, so the rest of whatever is in `library` is free
# to be cleaned up. The integer address is immutable, so it gets copied
# upon being referenced here
weakref.finalize(library, unload_dl, ctypes.c_void_p(library._handle_uint))
Collaborator:
In the case where, within _load_lib_no_cache, check_if_already_loaded_from_elsewhere returns the LoadedDL, we probably shouldn't close the handle either, since we're just resolving the handle to a library that was already loaded from elsewhere?

Collaborator:
Yeah, it's quite complex. It took me 5 minutes to write this prompt and upload the files, then the reasoning part took a whopping 10 minutes (the longest I've seen so far):

https://chatgpt.com/share/68d3709e-a60c-8008-95c2-978251b18874

I already spent 20 minutes reviewing the response. I think it's really good, although I already discovered one small flaw in the reference implementation. I'll work on it a little more to be sure it can be made to work in general, then we can decide what balance of complexity vs features we want.

Contributor Author:
@kkraus14

we probably shouldn't close the handle either since we're just resolving the handle to an already loaded library from elsewhere

If we're dlopen-ing it, then we need a corresponding dlclose. Are we dlopen-ing the library when we resolve the handle?

@rwgk

I really think we should try to think through the problem and not just go with whatever chatgpt gives us.

I personally find it hard to justify a complicated AI-generated solution that adds extra APIs, each introducing concepts (such as pinning) that are novel in Python APIs, and that requires maintaining something a human did not come up with and likely cannot explain with the level of understanding they would have if they had produced the solution themselves.

That said, this is just my opinion, so take it with a grain of salt. If the AI solution is really better than getting to the bottom of the problem and building a tailor-made solution then perhaps it's fine.

Contributor Author:
Since RTLD_NOLOAD increments the library's reference count, the finalize here is still necessary even if the library was loaded from elsewhere.
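As a minimal illustration (hypothetical library name, Linux/glibc only, not the pathfinder code): opening an already-loaded library with RTLD_NOLOAD still takes a reference that someone has to drop.

import ctypes
import os

try:
    # Succeeds only if the library is already mapped into this process;
    # on success the loader's reference count for it is incremented.
    lib = ctypes.CDLL("libnvJitLink.so.12", mode=os.RTLD_NOW | os.RTLD_NOLOAD)
except OSError:
    lib = None  # not loaded anywhere in this process

if lib is not None:
    # ctypes never calls dlclose on our behalf, so the extra reference taken
    # by the RTLD_NOLOAD dlopen stays held until something (e.g. a finalizer)
    # releases it explicitly.
    pass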

Collaborator:
The RTLD_NOLOAD is done via a ctypes.CDLL call, though, as opposed to calling dlopen directly. It seems ctypes doesn't actually close the handle, which is surprising to me and makes my previous statement incorrect.

Collaborator:
Wow, ok, tx.

I've been blissfully ignorant of the handle refcount system; I need to review the existing code with a new mindset. My preference is to unblock you here with a minimal change and then look at this carefully in a separate PR.

One thing I want to look into: Is it ok for the existing clients (mainly cython-gen, cybind) if we close the handle, which may trigger unloading the library? The clients are holding on to pointers into the dynamic libs, e.g.

with gil, __symbol_lock:
    # Load library
    handle = load_nvidia_dynamic_lib("nvJitLink")._handle_uint
    # Load function
    global __nvJitLinkCreate
    __nvJitLinkCreate = GetProcAddress(handle, 'nvJitLinkCreate')
    global __nvJitLinkDestroy
    __nvJitLinkDestroy = GetProcAddress(handle, 'nvJitLinkDestroy')
    global __nvJitLinkAddData
    __nvJitLinkAddData = GetProcAddress(handle, 'nvJitLinkAddData')
    global __nvJitLinkAddFile
    __nvJitLinkAddFile = GetProcAddress(handle, 'nvJitLinkAddFile')
    global __nvJitLinkComplete
    __nvJitLinkComplete = GetProcAddress(handle, 'nvJitLinkComplete')
    global __nvJitLinkGetLinkedCubinSize
    __nvJitLinkGetLinkedCubinSize = GetProcAddress(handle, 'nvJitLinkGetLinkedCubinSize')
    global __nvJitLinkGetLinkedCubin
    __nvJitLinkGetLinkedCubin = GetProcAddress(handle, 'nvJitLinkGetLinkedCubin')
    global __nvJitLinkGetLinkedPtxSize
    __nvJitLinkGetLinkedPtxSize = GetProcAddress(handle, 'nvJitLinkGetLinkedPtxSize')
    global __nvJitLinkGetLinkedPtx
    __nvJitLinkGetLinkedPtx = GetProcAddress(handle, 'nvJitLinkGetLinkedPtx')
    global __nvJitLinkGetErrorLogSize
    __nvJitLinkGetErrorLogSize = GetProcAddress(handle, 'nvJitLinkGetErrorLogSize')
    global __nvJitLinkGetErrorLog
    __nvJitLinkGetErrorLog = GetProcAddress(handle, 'nvJitLinkGetErrorLog')
    global __nvJitLinkGetInfoLogSize
    __nvJitLinkGetInfoLogSize = GetProcAddress(handle, 'nvJitLinkGetInfoLogSize')
    global __nvJitLinkGetInfoLog
    __nvJitLinkGetInfoLog = GetProcAddress(handle, 'nvJitLinkGetInfoLog')
    global __nvJitLinkVersion
    __nvJitLinkVersion = GetProcAddress(handle, 'nvJitLinkVersion')
    __py_nvjitlink_init = True
    return 0

I need to find out: What happens if we trigger unloading a library while we still have the addresses where the symbols were loaded into memory?

Collaborator:
It only runs when the process is finalizing.

I don't think that's true.

You're right, sorry.

Actually, no, based on this quick experiment it is true:

  • I checked out your leave-dl-dangling branch (this PR)

  • I added this one line:

--- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py
@@ -47,6 +47,7 @@ RTLD_DI_ORIGIN = 6


 def unload_dl(handle: ctypes.c_void_p) -> None:
+    print(f"\nLOOOK unload_dl({handle=!r})", flush=True)
     result = LIBDL.dlclose(handle)
     if result:
         raise RuntimeError(LIBDL.dlerror())
  • Then I ran the cuda_bindings unit tests:
(WslLocalCudaVenv) rwgk-win11.localdomain:~/forked/cuda-python/cuda_bindings $ pytest -ra -s -v tests/
====================================================================================================== test session starts =======================================================================================================
platform linux -- Python 3.12.3, pytest-8.4.2, pluggy-1.6.0 -- /home/rgrossekunst/forked/cuda-python/cuda_pathfinder/WslLocalCudaVenv/bin/python3
cachedir: .pytest_cache
benchmark: 5.1.0 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000)
rootdir: /home/rgrossekunst/forked/cuda-python/cuda_bindings/tests
configfile: pytest.ini
plugins: benchmark-5.1.0
collected 192 items / 1 skipped

tests/test_cuda.py::test_cuda_memcpy PASSED
tests/test_cuda.py::test_cuda_array PASSED
tests/test_cuda.py::test_cuda_repr_primitive PASSED
<... snip ...>
tests/test_cudart.py::test_getLocalRuntimeVersion PASSED
<... snip ...>
tests/test_utils.py::test_cyclical_imports[nvvm] PASSED
tests/test_utils.py::test_cyclical_imports[runtime] PASSED
tests/test_utils.py::test_cyclical_imports[cufile] PASSED

==================================================================================================== short test summary info =====================================================================================================
SKIPPED [1] tests/test_cufile.py:40: skipping cuFile tests on WSL
================================================================================================= 192 passed, 1 skipped in 7.03s =================================================================================================

LOOOK unload_dl(handle=c_void_p(791712928))

LOOOK unload_dl(handle=c_void_p(781336800))

LOOOK unload_dl(handle=c_void_p(732155104))

LOOOK unload_dl(handle=c_void_p(731635248))
(WslLocalCudaVenv) rwgk-win11.localdomain:~/forked/cuda-python/cuda_bindings $

Because of functools.cache, the finalizers only run in the process tear-down, when the handles are sure to disappear anyway.

Member:
I am very nervous about the finalizer solution because, as per Phillip's investigation last week, we will call the C APIs to do cleanup, but aren't the finalizers called before that step, leaving us with dangling pointers?

We have been leaking DSO handles on purpose since pretty much the beginning of this project (~2021) and no one complained, in particular with respect to our handling of the driver (libcuda). Why is this leaking suddenly not acceptable here?

Contributor Author:
... aren't the finalizers called before that step and leaving us dangling pointers?

Is it possible that the void* function pointers in various places get called after the library is closed, which happens at the module level? This would mean that the function (literally, the function object corresponding to the thing created by def; this matters because that's where the cache is stored) gets torn down and the finalizer is called on LoadedDL before one of those functions is invoked.

I can imagine the following scenario:

lib = load_nvidia_dynamic_lib("nvjitlink")
foo = create_something_that_eventually_calls_a_cleanup_routine() # note the possible implicit call to `load_nvidia_dynamic_lib`

# ... many lines of code later
#
# Python shuts down
# Python calls the finalizer for `lib`
# Python GC's `foo` which needs an open handle corresponding to `lib`'s address

So, we are probably always going to be constrained by this if we support code paths that can call globally-defined-and-expected-to-be-valid-pointers at any point in a program's lifetime.

I just spoke with @rwgk in person and we agreed that leaking the handles is preferable for now. I will revert the most recent commit, leave the comment, and then we can merge this fix.

Contributor Author:
So, as far as any of us knows, the answer is: "we don't know, but we know it might become necessary to call these functions at any point, so we leak the handles to avoid a worse outcome."

return library