From 1f84a5aa1281d0d21164ca85b4df415fb9ee3e24 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 2 Oct 2025 17:42:20 +0000 Subject: [PATCH 01/18] release gil --- cuda_core/cuda/core/experimental/_device.pyx | 79 +++++++++++-------- cuda_core/cuda/core/experimental/_event.pyx | 15 ++-- cuda_core/cuda/core/experimental/_stream.pyx | 47 ++++++----- .../core/experimental/_utils/cuda_utils.pxd | 4 +- .../core/experimental/_utils/cuda_utils.pyx | 31 ++++---- 5 files changed, 104 insertions(+), 72 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index ae1c7f38c5..701516f53b 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -55,7 +55,8 @@ cdef class DeviceProperties: cdef inline _get_attribute(self, cydriver.CUdevice_attribute attr): """Retrieve the attribute value directly from the driver.""" cdef int val - HANDLE_RETURN(cydriver.cuDeviceGetAttribute(&val, attr, self._handle)) + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetAttribute(&val, attr, self._handle)) return val cdef _get_cached_attribute(self, attr): @@ -949,7 +950,8 @@ cdef cydriver.CUcontext _get_primary_context(int dev_id) except?NULL: primary_ctxs = _tls.primary_ctxs = [0] * total cdef cydriver.CUcontext ctx = (primary_ctxs[dev_id]) if ctx == NULL: - HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id)) + with nogil: + HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id)) primary_ctxs[dev_id] = (ctx) return ctx @@ -985,7 +987,7 @@ class Device: def __new__(cls, device_id: Optional[int] = None): global _is_cuInit if _is_cuInit is False: - with _lock: + with _lock, nogil: HANDLE_RETURN(cydriver.cuInit(0)) _is_cuInit = True @@ -993,11 +995,13 @@ class Device: cdef cydriver.CUdevice dev cdef cydriver.CUcontext ctx if device_id is None: - err = cydriver.cuCtxGetDevice(&dev) + with nogil: + err = cydriver.cuCtxGetDevice(&dev) if err == cydriver.CUresult.CUDA_SUCCESS: device_id = int(dev) elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT: - HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) assert (ctx) == NULL device_id = 0 # cudart behavior else: @@ -1010,18 +1014,20 @@ class Device: try: devices = _tls.devices except AttributeError: - HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) devices = _tls.devices = [] for dev_id in range(total): device = super().__new__(cls) device._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. - HANDLE_RETURN( - cydriver.cuDeviceGetAttribute( - &attr, cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id + with nogil: + HANDLE_RETURN( + cydriver.cuDeviceGetAttribute( + &attr, cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id + ) ) - ) if attr == 1: device._mr = DeviceMemoryResource(dev_id) else: @@ -1042,16 +1048,18 @@ class Device: f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?" ) - def _get_current_context(self, check_consistency=False) -> driver.CUcontext: + def _get_current_context(self, bint check_consistency=False) -> driver.CUcontext: cdef cydriver.CUcontext ctx - HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) - if ctx == NULL: - raise CUDAError("No context is bound to the calling CPU thread.") cdef cydriver.CUdevice dev - if check_consistency: - HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) - if (dev) != self._id: - raise CUDAError("Internal error (current device is not equal to Device.device_id)") + cdef cydriver.CUdevice this_dev = self._id + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + if ctx == NULL: + raise CUDAError("No context is bound to the calling CPU thread.") + if check_consistency: + HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) + if dev != this_dev: + raise CUDAError("Internal error (current device is not equal to Device.device_id)") return driver.CUcontext(ctx) @property @@ -1080,10 +1088,12 @@ class Device: """ cdef cydriver.CUuuid uuid - IF CUDA_CORE_BUILD_MAJOR == "12": - HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, self._id)) - ELSE: # 13.0+ - HANDLE_RETURN(cydriver.cuDeviceGetUuid(&uuid, self._id)) + cdef cydriver.CUdevice this_dev = self._id + with nogil: + IF CUDA_CORE_BUILD_MAJOR == "12": + HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, this_dev)) + ELSE: # 13.0+ + HANDLE_RETURN(cydriver.cuDeviceGetUuid(&uuid, this_dev)) cdef bytes uuid_b = cpython.PyBytes_FromStringAndSize(uuid.bytes, sizeof(uuid.bytes)) cdef str uuid_hex = uuid_b.hex() # 8-4-4-4-12 @@ -1095,7 +1105,10 @@ class Device: # Use 256 characters to be consistent with CUDA Runtime cdef int LENGTH = 256 cdef bytes name = bytes(LENGTH) - HANDLE_RETURN(cydriver.cuDeviceGetName(name, LENGTH, self._id)) + cdef char* name_ptr = name + cdef cydriver.CUdevice this_dev = self._id + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetName(name_ptr, LENGTH, this_dev)) name = name.split(b"\0")[0] return name.decode() @@ -1198,7 +1211,8 @@ class Device: >>> # ... do work on device 0 ... """ - cdef cydriver.CUcontext _ctx + cdef cydriver.CUcontext prev_ctx + cdef cydriver.CUcontext curr_ctx if ctx is not None: # TODO: revisit once Context is cythonized assert_type(ctx, Context) @@ -1207,16 +1221,19 @@ class Device: "the provided context was created on the device with" f" id={ctx._id}, which is different from the target id={self._id}" ) - # _ctx is the previous context - HANDLE_RETURN(cydriver.cuCtxPopCurrent(&_ctx)) - HANDLE_RETURN(cydriver.cuCtxPushCurrent((ctx._handle))) + # prev_ctx is the previous context + curr_ctx = (ctx._handle) + with nogil: + HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx)) + HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx)) self._has_inited = True - if _ctx != NULL: - return Context._from_ctx((_ctx), self._id) + if prev_ctx != NULL: + return Context._from_ctx((prev_ctx), self._id) else: # use primary ctx - _ctx = _get_primary_context(self._id) - HANDLE_RETURN(cydriver.cuCtxSetCurrent(_ctx)) + curr_ctx = _get_primary_context(self._id) + with nogil: + HANDLE_RETURN(cydriver.cuCtxSetCurrent(curr_ctx)) self._has_inited = True def create_context(self, options: ContextOptions = None) -> Context: diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index db243717f6..f2dde21bd3 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -109,7 +109,8 @@ cdef class Event: self._busy_waited = True if opts.support_ipc: raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103") - HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags)) + with nogil: + HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags)) self._device_id = device_id self._ctx_handle = ctx_handle return self @@ -118,7 +119,8 @@ cdef class Event: if is_shutting_down and is_shutting_down(): return if self._handle != NULL: - HANDLE_RETURN(cydriver.cuEventDestroy(self._handle)) + with nogil: + HANDLE_RETURN(cydriver.cuEventDestroy(self._handle)) self._handle = (NULL) cpdef close(self): @@ -137,7 +139,8 @@ cdef class Event: def __sub__(self, other: Event): # return self - other (in milliseconds) cdef float timing - err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) + with nogil: + err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) if err == 0: return timing else: @@ -187,12 +190,14 @@ cdef class Event: has been completed. """ - HANDLE_RETURN(cydriver.cuEventSynchronize(self._handle)) + with nogil: + HANDLE_RETURN(cydriver.cuEventSynchronize(self._handle)) @property def is_done(self) -> bool: """Return True if all captured works have been completed, otherwise False.""" - result = cydriver.cuEventQuery(self._handle) + with nogil: + result = cydriver.cuEventQuery(self._handle) if result == cydriver.CUresult.CUDA_SUCCESS: return True if result == cydriver.CUresult.CUDA_ERROR_NOT_READY: diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 737fd13f95..4a0880338d 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -177,16 +177,21 @@ cdef class Stream: priority = opts.priority flags = cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else cydriver.CUstream_flags.CU_STREAM_DEFAULT + # TODO: we might want to consider memoizing high/low per CUDA context and avoid this call cdef int high, low - HANDLE_RETURN(cydriver.cuCtxGetStreamPriorityRange(&high, &low)) + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetStreamPriorityRange(&high, &low)) + cdef int prio if priority is not None: - if not (low <= priority <= high): + prio = priority + if not (low <= prio <= high): raise ValueError(f"{priority=} is out of range {[low, high]}") else: - priority = high + prio = high cdef cydriver.CUstream s - HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, priority)) + with nogil: + HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, prio)) self._handle = s self._owner = None self._nonblocking = nonblocking @@ -204,7 +209,8 @@ cdef class Stream: if self._owner is None: if self._handle and not self._builtin: - HANDLE_RETURN(cydriver.cuStreamDestroy(self._handle)) + with nogil: + HANDLE_RETURN(cydriver.cuStreamDestroy(self._handle)) else: self._owner = None self._handle = (NULL) @@ -238,7 +244,8 @@ cdef class Stream: """Return True if this is a nonblocking stream, otherwise False.""" cdef unsigned int flags if self._nonblocking is None: - HANDLE_RETURN(cydriver.cuStreamGetFlags(self._handle, &flags)) + with nogil: + HANDLE_RETURN(cydriver.cuStreamGetFlags(self._handle, &flags)) if flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING: self._nonblocking = True else: @@ -250,13 +257,15 @@ cdef class Stream: """Return the stream priority.""" cdef int prio if self._priority is None: - HANDLE_RETURN(cydriver.cuStreamGetPriority(self._handle, &prio)) + with nogil: + HANDLE_RETURN(cydriver.cuStreamGetPriority(self._handle, &prio)) self._priority = prio return self._priority def sync(self): """Synchronize the stream.""" - HANDLE_RETURN(cydriver.cuStreamSynchronize(self._handle)) + with nogil: + HANDLE_RETURN(cydriver.cuStreamSynchronize(self._handle)) def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. @@ -299,11 +308,12 @@ cdef class Stream: """ cdef cydriver.CUevent event cdef cydriver.CUstream stream - cdef bint discard_event if isinstance(event_or_stream, Event): event = (event_or_stream.handle) - discard_event = False + with nogil: + # TODO: support flags other than 0? + HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) else: if isinstance(event_or_stream, Stream): stream = (event_or_stream.handle) @@ -316,14 +326,12 @@ cdef class Stream: f" got {type(event_or_stream)}" ) from e stream = (s.handle) - HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) - HANDLE_RETURN(cydriver.cuEventRecord(event, stream)) - discard_event = True - - # TODO: support flags other than 0? - HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) - if discard_event: - HANDLE_RETURN(cydriver.cuEventDestroy(event)) + with nogil: + HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) + HANDLE_RETURN(cydriver.cuEventRecord(event, stream)) + # TODO: support flags other than 0? + HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) + HANDLE_RETURN(cydriver.cuEventDestroy(event)) @property def device(self) -> Device: @@ -344,7 +352,8 @@ cdef class Stream: # TODO: consider making self._ctx_handle typed? cdef cydriver.CUcontext ctx if self._ctx_handle is None: - HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &ctx)) + with nogil: + HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &ctx)) self._ctx_handle = driver.CUcontext(ctx) return 0 diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index bf570965f9..c04fc8497f 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -12,11 +12,11 @@ ctypedef fused supported_error_type: cydriver.CUresult -cdef int HANDLE_RETURN(supported_error_type err) except?-1 +cdef int HANDLE_RETURN(supported_error_type err) except?-1 nogil # TODO: stop exposing these within the codebase? -cpdef int _check_driver_error(error) except?-1 +cpdef int _check_driver_error(cydriver.CUresult error) except?-1 nogil cpdef int _check_runtime_error(error) except?-1 cpdef int _check_nvrtc_error(error) except?-1 diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index c095e75645..def486595c 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -52,32 +52,33 @@ def _reduce_3_tuple(t: tuple): return t[0] * t[1] * t[2] -cdef int HANDLE_RETURN(supported_error_type err) except?-1: +cdef int HANDLE_RETURN(supported_error_type err) except?-1 nogil: if supported_error_type is cydriver.CUresult: if err != cydriver.CUresult.CUDA_SUCCESS: return _check_driver_error(err) -cdef object _DRIVER_SUCCESS = driver.CUresult.CUDA_SUCCESS cdef object _RUNTIME_SUCCESS = runtime.cudaError_t.cudaSuccess cdef object _NVRTC_SUCCESS = nvrtc.nvrtcResult.NVRTC_SUCCESS -cpdef inline int _check_driver_error(error) except?-1: - if error == _DRIVER_SUCCESS: +cpdef inline int _check_driver_error(cydriver.CUresult error) except?-1 nogil: + if error == cydriver.CUresult.CUDA_SUCCESS: return 0 - name_err, name = driver.cuGetErrorName(error) - if name_err != _DRIVER_SUCCESS: + cdef const char* name + name_err = cydriver.cuGetErrorName(error, &name) + if name_err != cydriver.CUresult.CUDA_SUCCESS: raise CUDAError(f"UNEXPECTED ERROR CODE: {error}") - name = name.decode() - expl = DRIVER_CU_RESULT_EXPLANATIONS.get(int(error)) - if expl is not None: - raise CUDAError(f"{name}: {expl}") - desc_err, desc = driver.cuGetErrorString(error) - if desc_err != _DRIVER_SUCCESS: - raise CUDAError(f"{name}") - desc = desc.decode() - raise CUDAError(f"{name}: {desc}") + with gil: + # TODO: consider lower this to Cython + expl = DRIVER_CU_RESULT_EXPLANATIONS.get(int(error)) + if expl is not None: + raise CUDAError(f"{name.decode()}: {expl}") + cdef const char* desc + desc_err = cydriver.cuGetErrorString(error, &desc) + if desc_err != cydriver.CUresult.CUDA_SUCCESS: + raise CUDAError(f"{name.decode()}") + raise CUDAError(f"{name.decode()}: {desc.decode()}") cpdef inline int _check_runtime_error(error) except?-1: From 2c8682567a50b0b9ea5ae9d1cea895d17911545b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 2 Oct 2025 20:48:14 +0000 Subject: [PATCH 02/18] release gil for record --- cuda_core/cuda/core/experimental/_event.pxd | 14 ++++++++++++++ cuda_core/cuda/core/experimental/_event.pyx | 7 ------- cuda_core/cuda/core/experimental/_stream.pyx | 6 ++++-- cuda_core/tests/test_stream.py | 2 +- 4 files changed, 19 insertions(+), 10 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_event.pxd diff --git a/cuda_core/cuda/core/experimental/_event.pxd b/cuda_core/cuda/core/experimental/_event.pxd new file mode 100644 index 0000000000..650c525b6f --- /dev/null +++ b/cuda_core/cuda/core/experimental/_event.pxd @@ -0,0 +1,14 @@ +from cuda.bindings cimport cydriver + + +cdef class Event: + + cdef: + cydriver.CUevent _handle + bint _timing_disabled + bint _busy_waited + int _device_id + object _ctx_handle + + cdef _shutdown_safe_close(self, is_shutting_down=*) + cpdef close(self) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index f2dde21bd3..f8b1e0f252 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -81,13 +81,6 @@ cdef class Event: and they should instead be created through a :obj:`~_stream.Stream` object. """ - cdef: - cydriver.CUevent _handle - bint _timing_disabled - bint _busy_waited - int _device_id - object _ctx_handle - def __cinit__(self): self._handle = (NULL) diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 4a0880338d..e419b18770 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -8,6 +8,7 @@ from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver +from cuda.core.experimental._event cimport Event as cyEvent from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN, @@ -292,8 +293,9 @@ cdef class Stream: if event is None: self._get_device_and_context() event = Event._init(self._device_id, self._ctx_handle, options) - # TODO: revisit after Event is cythonized - HANDLE_RETURN(cydriver.cuEventRecord((event.handle), self._handle)) + cdef cydriver.CUevent e = ((event))._handle + with nogil: + HANDLE_RETURN(cydriver.cuEventRecord(e, self._handle)) return event def wait(self, event_or_stream: Union[Event, Stream]): diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index e5ed99acfb..845522cb43 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -51,7 +51,7 @@ def test_stream_record(init_cuda): def test_stream_record_invalid_event(init_cuda): stream = Device().create_stream(options=StreamOptions()) - with pytest.raises(AttributeError): + with pytest.raises(TypeError): stream.record(event="invalid_event") From efe7fe00fd889845245866e1c123c381591da37f Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 2 Oct 2025 21:28:51 +0000 Subject: [PATCH 03/18] use __dealloc__ in event/stream --- cuda_core/cuda/core/experimental/_event.pxd | 1 - cuda_core/cuda/core/experimental/_event.pyx | 15 ++++--------- cuda_core/cuda/core/experimental/_stream.pyx | 22 +++++++------------- 3 files changed, 11 insertions(+), 27 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.pxd b/cuda_core/cuda/core/experimental/_event.pxd index 650c525b6f..48a9257776 100644 --- a/cuda_core/cuda/core/experimental/_event.pxd +++ b/cuda_core/cuda/core/experimental/_event.pxd @@ -10,5 +10,4 @@ cdef class Event: int _device_id object _ctx_handle - cdef _shutdown_safe_close(self, is_shutting_down=*) cpdef close(self) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index f8b1e0f252..962556597a 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -20,9 +20,7 @@ from cuda.core.experimental._context import Context from cuda.core.experimental._utils.cuda_utils import ( CUDAError, driver, - handle_return, ) -import sys if TYPE_CHECKING: import cuda.bindings from cuda.core.experimental._device import Device @@ -108,20 +106,15 @@ cdef class Event: self._ctx_handle = ctx_handle return self - cdef _shutdown_safe_close(self, is_shutting_down=sys.is_finalizing): - if is_shutting_down and is_shutting_down(): - return + cpdef close(self): + """Destroy the event.""" if self._handle != NULL: with nogil: HANDLE_RETURN(cydriver.cuEventDestroy(self._handle)) self._handle = (NULL) - cpdef close(self): - """Destroy the event.""" - self._shutdown_safe_close(is_shutting_down=None) - - def __del__(self): - self._shutdown_safe_close() + def __dealloc__(self): + self.close() def __isub__(self, other): return NotImplemented diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index e419b18770..367000b12e 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -14,8 +14,6 @@ from cuda.core.experimental._utils.cuda_utils cimport ( HANDLE_RETURN, ) -import sys - import cython import os import warnings @@ -202,12 +200,15 @@ cdef class Stream: return self def __del__(self): - self._shutdown_safe_close() + self.close() + + cpdef close(self): + """Destroy the stream. - cdef _shutdown_safe_close(self, is_shutting_down=sys.is_finalizing): - if is_shutting_down and is_shutting_down(): - return + Destroy the stream if we own it. Borrowed foreign stream + object will instead have their references released. + """ if self._owner is None: if self._handle and not self._builtin: with nogil: @@ -216,15 +217,6 @@ cdef class Stream: self._owner = None self._handle = (NULL) - cpdef close(self): - """Destroy the stream. - - Destroy the stream if we own it. Borrowed foreign stream - object will instead have their references released. - - """ - self._shutdown_safe_close(is_shutting_down=None) - def __cuda_stream__(self) -> tuple[int, int]: """Return an instance of a __cuda_stream__ protocol.""" return (0, int(self.handle)) From fa88b286b4b111aade78875622d488a12de2f7f8 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 2 Oct 2025 21:31:15 +0000 Subject: [PATCH 04/18] nit: remove print --- cuda_core/tests/test_system.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index b7eab9e753..d5195ed872 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -19,7 +19,6 @@ def test_system_singleton(): def test_driver_version(): driver_version = system.driver_version - print(driver_version) version = handle_return(driver.cuDriverGetVersion()) expected_driver_version = (version // 1000, (version % 1000) // 10) assert driver_version == expected_driver_version, "Driver version does not match expected value" From 1083d90dde158d3ab4e80ff43edf5927b5a2d167 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 3 Oct 2025 03:39:22 +0000 Subject: [PATCH 05/18] fix linter error --- cuda_core/cuda/core/experimental/_event.pxd | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_event.pxd b/cuda_core/cuda/core/experimental/_event.pxd index 48a9257776..0972063af3 100644 --- a/cuda_core/cuda/core/experimental/_event.pxd +++ b/cuda_core/cuda/core/experimental/_event.pxd @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + from cuda.bindings cimport cydriver From c06dc9de35595d72ef2406f848faad7a78eab505 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 3 Oct 2025 04:59:32 +0000 Subject: [PATCH 06/18] reduce further the number of Python objects held by Stream --- cuda_core/cuda/core/experimental/_device.pyx | 2 + cuda_core/cuda/core/experimental/_stream.pyx | 50 +++++++++---------- .../core/experimental/_utils/cuda_utils.pxd | 8 +++ .../core/experimental/_utils/cuda_utils.pyx | 29 ++++++----- 4 files changed, 49 insertions(+), 40 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index 701516f53b..7ab8731d30 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -27,6 +27,8 @@ from cuda.core.experimental._utils.cuda_utils import ( ) +# TODO: I prefer to type these as "cdef object" and avoid accessing them from within Python, +# but it seems it is very convenient to expose them for testing purposes... _tls = threading.local() _lock = threading.Lock() cdef bint _is_cuInit = False diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 367000b12e..20bdc27b7c 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -11,6 +11,8 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._event cimport Event as cyEvent from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, + CU_CONTEXT_INVALID, + get_device_from_ctx, HANDLE_RETURN, ) @@ -29,8 +31,6 @@ from cuda.core.experimental._graph import GraphBuilder from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( driver, - get_device_from_ctx, - handle_return, ) @@ -117,11 +117,13 @@ cdef class Stream: object _builtin object _nonblocking object _priority - object _device_id - object _ctx_handle + cydriver.CUdevice _device_id + cydriver.CUcontext _ctx_handle def __cinit__(self): self._handle = (NULL) + self._device_id = cydriver.CU_DEVICE_INVALID # delayed + self._ctx_handle = CU_CONTEXT_INVALID # delayed def __init__(self, *args, **kwargs): raise RuntimeError( @@ -137,8 +139,6 @@ cdef class Stream: self._builtin = True self._nonblocking = None # delayed self._priority = None # delayed - self._device_id = None # delayed - self._ctx_handle = None # delayed return self @classmethod @@ -149,8 +149,6 @@ cdef class Stream: self._builtin = True self._nonblocking = None # delayed self._priority = None # delayed - self._device_id = None # delayed - self._ctx_handle = None # delayed return self @classmethod @@ -167,8 +165,6 @@ cdef class Stream: self._owner = obj self._nonblocking = None # delayed self._priority = None # delayed - self._device_id = None # delayed - self._ctx_handle = None # delayed return self cdef StreamOptions opts = check_or_create_options(StreamOptions, options, "Stream options") @@ -195,8 +191,7 @@ cdef class Stream: self._owner = None self._nonblocking = nonblocking self._priority = priority - self._device_id = device_id - self._ctx_handle = None # delayed + self._device_id = device_id if device_id is not None else self._device_id return self def __del__(self): @@ -284,7 +279,7 @@ cdef class Stream: # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. if event is None: self._get_device_and_context() - event = Event._init(self._device_id, self._ctx_handle, options) + event = Event._init((self._device_id), (self._ctx_handle), options) cdef cydriver.CUevent e = ((event))._handle with nogil: HANDLE_RETURN(cydriver.cuEventRecord(e, self._handle)) @@ -340,22 +335,23 @@ cdef class Stream: """ from cuda.core.experimental._device import Device # avoid circular import self._get_device_and_context() - return Device(self._device_id) + return Device((self._device_id)) - cdef int _get_context(Stream self) except?-1: - # TODO: consider making self._ctx_handle typed? - cdef cydriver.CUcontext ctx - if self._ctx_handle is None: - with nogil: - HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &ctx)) - self._ctx_handle = driver.CUcontext(ctx) + cdef int _get_context(self) except?-1 nogil: + if self._ctx_handle == CU_CONTEXT_INVALID: + HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &(self._ctx_handle))) return 0 - cdef int _get_device_and_context(Stream self) except?-1: - if self._device_id is None: - # Get the stream context first - self._get_context() - self._device_id = get_device_from_ctx(self._ctx_handle) + cdef int _get_device_and_context(self) except?-1: + cdef cydriver.CUcontext curr_ctx + if self._device_id == cydriver.CU_DEVICE_INVALID: + # TODO: It is likely faster/safer to call cuCtxGetCurrent? + from cuda.core.experimental._device import Device # avoid circular import + curr_ctx = (Device().context._handle) + with nogil: + # Get the stream context first + self._get_context() + self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx) return 0 @property @@ -363,7 +359,7 @@ cdef class Stream: """Return the :obj:`~_context.Context` associated with this stream.""" self._get_context() self._get_device_and_context() - return Context._from_ctx(self._ctx_handle, self._device_id) + return Context._from_ctx((self._ctx_handle), (self._device_id)) @staticmethod def from_handle(handle: int) -> Stream: diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index c04fc8497f..442fc70e20 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -12,6 +12,14 @@ ctypedef fused supported_error_type: cydriver.CUresult +# mimic CU_DEVICE_INVALID +cdef cydriver.CUcontext CU_CONTEXT_INVALID = (-2) + + +cdef cydriver.CUdevice get_device_from_ctx( + cydriver.CUcontext target_ctx, cydriver.CUcontext curr_ctx) except?cydriver.CU_DEVICE_INVALID nogil + + cdef int HANDLE_RETURN(supported_error_type err) except?-1 nogil diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index def486595c..ddb7683bc5 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -192,20 +192,23 @@ def precondition(checker: Callable[..., None], str what="") -> Callable: return outer -def get_device_from_ctx(ctx_handle) -> int: +cdef cydriver.CUdevice get_device_from_ctx( + cydriver.CUcontext target_ctx, cydriver.CUcontext curr_ctx) except?cydriver.CU_DEVICE_INVALID nogil: """Get device ID from the given ctx.""" - from cuda.core.experimental._device import Device # avoid circular import - - prev_ctx = Device().context._handle - switch_context = int(ctx_handle) != int(prev_ctx) - if switch_context: - assert prev_ctx == handle_return(driver.cuCtxPopCurrent()) - handle_return(driver.cuCtxPushCurrent(ctx_handle)) - device_id = int(handle_return(driver.cuCtxGetDevice())) - if switch_context: - assert ctx_handle == handle_return(driver.cuCtxPopCurrent()) - handle_return(driver.cuCtxPushCurrent(prev_ctx)) - return device_id + cdef bint switch_context = (curr_ctx != target_ctx) + cdef cydriver.CUcontext ctx + cdef cydriver.CUdevice target_dev + with nogil: + if switch_context: + HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) + assert curr_ctx == ctx + HANDLE_RETURN(cydriver.cuCtxPushCurrent(target_ctx)) + HANDLE_RETURN(cydriver.cuCtxGetDevice(&target_dev)) + if switch_context: + HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) + assert target_ctx == ctx + HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx)) + return target_dev def is_sequence(obj): From b713c5c4160324e401c0587e244f0f94097e182a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 3 Oct 2025 05:10:58 +0000 Subject: [PATCH 07/18] replace a few more __del__ by __dealloc__ --- cuda_core/cuda/core/experimental/_memory.pyx | 24 +++++++------------- cuda_core/cuda/core/experimental/_stream.pyx | 2 +- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 3fdc1410f7..5fb0457f93 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -9,7 +9,6 @@ from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, ) -import sys from dataclasses import dataclass from typing import Optional, TypeVar, Union, TYPE_CHECKING @@ -20,7 +19,6 @@ import cython import multiprocessing import os import platform -import sys import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream @@ -72,17 +70,8 @@ cdef class Buffer: self._mr = mr return self - def __del__(self): - self._shutdown_safe_close() - - cdef _shutdown_safe_close(self, stream: Stream = None, is_shutting_down=sys.is_finalizing): - if is_shutting_down and is_shutting_down(): - return - if self._ptr and self._mr is not None: - self._mr.deallocate(self._ptr, self._size, stream) - self._ptr = 0 - self._mr = None - self._ptr_obj = None + def __dealloc__(self): + self.close() def __reduce__(self): return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor()) @@ -99,7 +88,11 @@ cdef class Buffer: The stream object to use for asynchronous deallocation. If None, the behavior depends on the underlying memory resource. """ - self._shutdown_safe_close(stream, is_shutting_down=None) + if self._ptr and self._mr is not None: + self._mr.deallocate(self._ptr, self._size, stream) + self._ptr = 0 + self._mr = None + self._ptr_obj = None @property def handle(self) -> DevicePointerT: @@ -424,8 +417,7 @@ cdef class IPCAllocationHandle: self._handle = -1 self._uuid = None - def __del__(self): - """Close the handle.""" + def __dealloc__(self): self.close() def __int__(self) -> int: diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 20bdc27b7c..2a95edbd1a 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -194,7 +194,7 @@ cdef class Stream: self._device_id = device_id if device_id is not None else self._device_id return self - def __del__(self): + def __dealloc__(self): self.close() cpdef close(self): From d764f5bb78e712af2adad517a2b2fcebbd333d26 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 3 Oct 2025 05:32:55 +0000 Subject: [PATCH 08/18] improve __cuda_stream__ performance --- cuda_core/cuda/core/experimental/_stream.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 2a95edbd1a..25306c4525 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -214,7 +214,7 @@ cdef class Stream: def __cuda_stream__(self) -> tuple[int, int]: """Return an instance of a __cuda_stream__ protocol.""" - return (0, int(self.handle)) + return (0, (self._handle)) @property def handle(self) -> cuda.bindings.driver.CUstream: From 3684644fb81d53adfeacff5bb668a03d8344b854 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 3 Oct 2025 18:12:00 +0000 Subject: [PATCH 09/18] cythonize Buffer & DMR (WIP - not working!) --- cuda_core/cuda/core/experimental/_memory.pyx | 236 +++++++++++-------- cuda_core/tests/memory_ipc/test_errors.py | 143 ----------- cuda_core/tests/memory_ipc/test_leaks.py | 7 +- cuda_core/tests/test_memory.py | 8 +- 4 files changed, 145 insertions(+), 249 deletions(-) delete mode 100644 cuda_core/tests/memory_ipc/test_errors.py diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 5fb0457f93..b64e77c83d 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -4,10 +4,16 @@ from __future__ import annotations -from libc.stdint cimport uintptr_t +cimport cpython +from libc.stdint cimport uintptr_t, intptr_t +from libc.string cimport memset + +from cuda.bindings cimport cydriver + from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, + HANDLE_RETURN, ) from dataclasses import dataclass @@ -53,7 +59,7 @@ cdef class Buffer: """ cdef: - uintptr_t _ptr + intptr_t _ptr size_t _size object _mr object _ptr_obj @@ -140,20 +146,23 @@ cdef class Buffer: """Export a buffer allocated for sharing between processes.""" if not self._mr.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") - err, ptr = driver.cuMemPoolExportPointer(self.handle) - raise_if_driver_error(err) - return IPCBufferDescriptor._init(ptr.reserved, self.size) + cdef cydriver.CUmemPoolPtrExportData data + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportPointer(&data, (self._ptr))) + cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) + return IPCBufferDescriptor._init(data_b, self.size) @classmethod - def from_ipc_descriptor(cls, mr: MemoryResource, ipc_buffer: IPCBufferDescriptor) -> Buffer: + def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor) -> Buffer: """Import a buffer that was exported from another process.""" if not mr.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") - share_data = driver.CUmemPoolPtrExportData() + cdef cydriver.CUmemPoolPtrExportData share_data share_data.reserved = ipc_buffer._reserved - err, ptr = driver.cuMemPoolImportPointer(mr._mempool_handle, share_data) - raise_if_driver_error(err) - return Buffer.from_handle(ptr, ipc_buffer.size, mr) + cdef cydriver.CUdeviceptr ptr + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &share_data)) + return Buffer.from_handle(ptr, ipc_buffer.size, mr) def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. @@ -360,10 +369,9 @@ class MemoryResource(abc.ABC): # IPC is currently only supported on Linux. On other platforms, the IPC handle # type is set equal to the no-IPC handle type. +cdef cydriver.CUmemAllocationHandleType _IPC_HANDLE_TYPE = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ + if platform.system() == "Linux" else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE -_NOIPC_HANDLE_TYPE = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE -_IPC_HANDLE_TYPE = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ - if platform.system() == "Linux" else _NOIPC_HANDLE_TYPE cdef class IPCBufferDescriptor: """Serializable object describing a buffer that can be shared between processes.""" @@ -466,6 +474,7 @@ cdef class DeviceMemoryResourceOptions: max_size : cython.int = 0 +# TODO: cythonize this? class DeviceMemoryResourceAttributes: def __init__(self, *args, **kwargs): raise RuntimeError("DeviceMemoryResourceAttributes cannot be instantiated directly. Please use MemoryResource APIs.") @@ -483,8 +492,9 @@ class DeviceMemoryResourceAttributes: def fget(self) -> property_type: mr = self._mr() if mr is None: - raise RuntimeError("DeviceMemoryResource is expired") - err, value = driver.cuMemPoolGetAttribute(mr._mempool_handle, attr_enum) + raise RuntimeError("DeviceMemoryResource is expired") + # TODO: this implementation does not allow lowering to Cython + nogil + err, value = driver.cuMemPoolGetAttribute(mr.handle, attr_enum) raise_if_driver_error(err) return property_type(value) return property(fget=fget, doc=stub.__doc__) @@ -531,7 +541,34 @@ class DeviceMemoryResourceAttributes: # and the serialized buffer descriptor. _ipc_registry = {} -class DeviceMemoryResource(MemoryResource): + +cdef class _DeviceMemoryResourceBase: + """Internal only. Responsible for offering C layout & attribute access.""" + cdef: + int _dev_id + cydriver.CUmemoryPool _mempool_handle + object _attributes + cydriver.CUmemAllocationHandleType _ipc_handle_type + bint _mempool_owned + bint _is_mapped + object _uuid + IPCAllocationHandle _alloc_handle + + def __cinit__(self): + self._dev_id = cydriver.CU_DEVICE_INVALID + self._mempool_handle = NULL + self._attributes = None + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX + self._mempool_owned = False + self._is_mapped = False + self._uuid = None + self._alloc_handle = None + + def __dealloc__(self): + pass + + +cdef class DeviceMemoryResource(_DeviceMemoryResourceBase, MemoryResource): """ Create a device memory resource managing a stream-ordered memory pool. @@ -609,97 +646,91 @@ class DeviceMemoryResource(MemoryResource): methods. The reconstruction procedure uses the registry to find the associated MMR. """ - __slots__ = ("_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", - "_mempool_owned", "_is_mapped", "_uuid", "_alloc_handle") + cdef: + dict __dict__ + object __weakref__ def __init__(self, device_id: int | Device, options=None): - device_id = getattr(device_id, 'device_id', device_id) + cdef int dev_id = getattr(device_id, 'device_id', device_id) opts = check_or_create_options( DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", keep_none=True ) + cdef cydriver.cuuint64_t current_threshold + cdef cydriver.cuuint64_t max_threshold = 0xFFFFFFFFFFFFFFFF + cdef cydriver.CUmemPoolProps properties if opts is None: # Get the current memory pool. - self._dev_id = device_id - self._mempool_handle = None - self._attributes = None - self._ipc_handle_type = _NOIPC_HANDLE_TYPE + self._dev_id = dev_id + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE self._mempool_owned = False - self._is_mapped = False - self._uuid = None - self._alloc_handle = None - err, self._mempool_handle = driver.cuDeviceGetMemPool(self.device_id) - raise_if_driver_error(err) + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._mempool_handle), dev_id)) - # Set a higher release threshold to improve performance when there are no active allocations. - # By default, the release threshold is 0, which means memory is immediately released back - # to the OS when there are no active suballocations, causing performance issues. - # Check current release threshold - err, current_threshold = driver.cuMemPoolGetAttribute( - self._mempool_handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD - ) - raise_if_driver_error(err) - # If threshold is 0 (default), set it to maximum to retain memory in the pool - if int(current_threshold) == 0: - err, = driver.cuMemPoolSetAttribute( - self._mempool_handle, - driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, - driver.cuuint64_t(0xFFFFFFFFFFFFFFFF), + # Set a higher release threshold to improve performance when there are no active allocations. + # By default, the release threshold is 0, which means memory is immediately released back + # to the OS when there are no active suballocations, causing performance issues. + # Check current release threshold + HANDLE_RETURN(cydriver.cuMemPoolGetAttribute( + self._mempool_handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold) ) - raise_if_driver_error(err) + + # If threshold is 0 (default), set it to maximum to retain memory in the pool + if current_threshold == 0: + HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( + self._mempool_handle, + cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + &max_threshold + )) else: # Create a new memory pool. - if opts.ipc_enabled and _IPC_HANDLE_TYPE == _NOIPC_HANDLE_TYPE: + if opts.ipc_enabled and _IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: raise RuntimeError("IPC is not available on {platform.system()}") - properties = driver.CUmemPoolProps() - properties.allocType = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED - properties.handleTypes = _IPC_HANDLE_TYPE if opts.ipc_enabled else _NOIPC_HANDLE_TYPE - properties.location = driver.CUmemLocation() - properties.location.id = device_id - properties.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) + properties.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED + properties.handleTypes = _IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + properties.location.id = dev_id + properties.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE properties.maxSize = opts.max_size - properties.win32SecurityAttributes = 0 + properties.win32SecurityAttributes = NULL properties.usage = 0 - self._dev_id = device_id - self._mempool_handle = None - self._attributes = None + self._dev_id = dev_id self._ipc_handle_type = properties.handleTypes self._mempool_owned = True - self._is_mapped = False - self._uuid = None - self._alloc_handle = None - err, self._mempool_handle = driver.cuMemPoolCreate(properties) - raise_if_driver_error(err) + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._mempool_handle), &properties)) + # TODO: should we also set the threshold here? if opts.ipc_enabled: - self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid + self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid - def __del__(self): + def __dealloc__(self): self.close() def close(self): """Close the device memory resource and destroy the associated memory pool if owned.""" - if self._mempool_handle is not None: - try: - if self._mempool_owned: - err, = driver.cuMemPoolDestroy(self._mempool_handle) - raise_if_driver_error(err) - finally: - if self.is_mapped: - self.unregister() - self._dev_id = None - self._mempool_handle = None - self._attributes = None - self._ipc_handle_type = _NOIPC_HANDLE_TYPE - self._mempool_owned = False - self._is_mapped = False - self._uuid = None - self._alloc_handle = None + if self._mempool_handle == NULL: + return + try: + if self._mempool_owned: + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._mempool_handle)) + finally: + if self.is_mapped: + self.unregister() + self._dev_id = cydriver.CU_DEVICE_INVALID + self._mempool_handle = NULL + self._attributes = None + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX + self._mempool_owned = False + self._is_mapped = False + self._uuid = None + self._alloc_handle = None def __reduce__(self): return DeviceMemoryResource.from_registry, (self.uuid,) @@ -775,30 +806,33 @@ class DeviceMemoryResource(MemoryResource): """ # Quick exit for registry hits. uuid = getattr(alloc_handle, 'uuid', None) - self = _ipc_registry.get(uuid) - if self is not None: - return self + mr = _ipc_registry.get(uuid) + if mr is not None: + return mr device_id = getattr(device_id, 'device_id', device_id) - self = cls.__new__(cls) + cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) self._dev_id = device_id - self._mempool_handle = None + self._mempool_handle = NULL self._attributes = None self._ipc_handle_type = _IPC_HANDLE_TYPE self._mempool_owned = True self._is_mapped = True self._uuid = None - self._alloc_handle = None # only used for non-imported + self._alloc_handle = None # only used for non-imported - err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), _IPC_HANDLE_TYPE, 0) - raise_if_driver_error(err) + cdef int handle = int(alloc_handle) + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( + &(self._mempool_handle), &handle, _IPC_HANDLE_TYPE, 0) + ) if uuid is not None: registered = self.register(uuid) assert registered is self return self - def get_allocation_handle(self) -> IPCAllocationHandle: + cpdef IPCAllocationHandle get_allocation_handle(self): """Export the memory pool handle to be shared (requires IPC). The handle can be used to share the memory pool with other processes. @@ -808,13 +842,19 @@ class DeviceMemoryResource(MemoryResource): ------- The shareable handle for the memory pool. """ + # Note: This is Linux only (int for file descriptor) + cdef int alloc_handle + if self._alloc_handle is None: if not self.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") if self._is_mapped: raise RuntimeError("Imported memory resource cannot be exported") - err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) - raise_if_driver_error(err) + + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( + &alloc_handle, self._mempool_handle, _IPC_HANDLE_TYPE, 0) + ) try: assert self._uuid is None import uuid @@ -846,9 +886,11 @@ class DeviceMemoryResource(MemoryResource): raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") if stream is None: stream = default_stream() - err, ptr = driver.cuMemAllocFromPoolAsync(size, self._mempool_handle, stream.handle) - raise_if_driver_error(err) - return Buffer._init(ptr, size, self) + cdef cydriver.CUstream s = (stream.handle) + cdef cydriver.CUdeviceptr devptr + with nogil: + HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._mempool_handle, s)) + return Buffer._init(devptr, size, self) def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): """Deallocate a buffer previously allocated by this resource. @@ -865,8 +907,10 @@ class DeviceMemoryResource(MemoryResource): """ if stream is None: stream = default_stream() - err, = driver.cuMemFreeAsync(ptr, stream.handle) - raise_if_driver_error(err) + cdef cydriver.CUstream s = (stream.handle) + cdef cydriver.CUdeviceptr devptr = ptr + with nogil: + HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) @property def attributes(self) -> DeviceMemoryResourceAttributes: @@ -881,9 +925,9 @@ class DeviceMemoryResource(MemoryResource): return self._dev_id @property - def handle(self) -> cuda.bindings.driver.CUmemoryPool: + def handle(self) -> driver.CUmemoryPool: """Handle to the underlying memory pool.""" - return self._mempool_handle + return driver.CUmemoryPool((self._mempool_handle)) @property def is_handle_owned(self) -> bool: @@ -911,7 +955,7 @@ class DeviceMemoryResource(MemoryResource): @property def is_ipc_enabled(self) -> bool: """Whether this memory resource has IPC enabled.""" - return self._ipc_handle_type != _NOIPC_HANDLE_TYPE + return self._ipc_handle_type != cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE def _deep_reduce_device_memory_resource(mr): diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py deleted file mode 100644 index 3e8265b39c..0000000000 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ /dev/null @@ -1,143 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -import multiprocessing -import pickle -import re - -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError - -CHILD_TIMEOUT_SEC = 20 -NBYTES = 64 -POOL_SIZE = 2097152 - - -class ChildErrorHarness: - """Test harness for checking errors in child processes. Subclasses override - PARENT_ACTION, CHILD_ACTION, and ASSERT (see below for examples).""" - - def test_main(self, ipc_device, ipc_memory_resource): - """Parent process that checks child errors.""" - # Attach fixtures to this object for convenience. These can be accessed - # from PARENT_ACTION. - self.device = ipc_device - self.mr = ipc_memory_resource - - # Start a child process to generate error info. - pipe = [multiprocessing.Queue() for _ in range(2)] - process = multiprocessing.Process(target=self.child_main, args=(pipe, self.device, self.mr)) - process.start() - - # Interact. - self.PARENT_ACTION(pipe[0]) - - # Check the error. - exc_type, exc_msg = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) - self.ASSERT(exc_type, exc_msg) - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - def child_main(self, pipe, device, mr): - """Child process that pushes IPC errors to a shared pipe for testing.""" - self.device = device - self.device.set_current() - self.mr = mr - try: - self.CHILD_ACTION(pipe[0]) - except Exception as e: - exc_info = type(e), str(e) - else: - exc_info = None, None - pipe[1].put(exc_info) - - -class TestAllocFromImportedMr(ChildErrorHarness): - """Error when attempting to allocate from an import memory resource.""" - - def PARENT_ACTION(self, queue): - queue.put(self.mr) - - def CHILD_ACTION(self, queue): - mr = queue.get(timeout=CHILD_TIMEOUT_SEC) - mr.allocate(NBYTES) - - def ASSERT(self, exc_type, exc_msg): - assert exc_type is TypeError - assert exc_msg == "Cannot allocate from a mapped IPC-enabled memory resource" - - -class TestImportWrongMR(ChildErrorHarness): - """Error when importing a buffer from the wrong memory resource.""" - - def PARENT_ACTION(self, queue): - options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) - mr2 = DeviceMemoryResource(self.device, options=options) - buffer = mr2.allocate(NBYTES) - queue.put([self.mr, buffer.get_ipc_descriptor()]) # Note: mr does not own this buffer - - def CHILD_ACTION(self, queue): - mr, buffer_desc = queue.get(timeout=CHILD_TIMEOUT_SEC) - Buffer.from_ipc_descriptor(mr, buffer_desc) - - def ASSERT(self, exc_type, exc_msg): - assert exc_type is CUDAError - assert "CUDA_ERROR_INVALID_VALUE" in exc_msg - - -class TestExportImportedMR(ChildErrorHarness): - """Error when exporting a memory resource that was imported.""" - - def PARENT_ACTION(self, queue): - queue.put(self.mr) - - def CHILD_ACTION(self, queue): - mr = queue.get(timeout=CHILD_TIMEOUT_SEC) - mr.get_allocation_handle() - - def ASSERT(self, exc_type, exc_msg): - assert exc_type is RuntimeError - assert exc_msg == "Imported memory resource cannot be exported" - - -class TestImportBuffer(ChildErrorHarness): - """Error when using a buffer as a buffer descriptor.""" - - def PARENT_ACTION(self, queue): - # Note: if the buffer is not attached to something to prolong its life, - # CUDA_ERROR_INVALID_CONTEXT is raised from Buffer.__del__ - self.buffer = self.mr.allocate(NBYTES) - queue.put(self.buffer) - - def CHILD_ACTION(self, queue): - buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) - Buffer.from_ipc_descriptor(self.mr, buffer) - - def ASSERT(self, exc_type, exc_msg): - assert exc_type is TypeError - assert exc_msg.startswith("Argument 'ipc_buffer' has incorrect type") - - -class TestDanglingBuffer(ChildErrorHarness): - """ - Error when importing a buffer object without registering its memory - resource. - """ - - def PARENT_ACTION(self, queue): - options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) - mr2 = DeviceMemoryResource(self.device, options=options) - self.buffer = mr2.allocate(NBYTES) - buffer_s = pickle.dumps(self.buffer) # noqa: S301 - queue.put(buffer_s) # Note: mr2 not sent - - def CHILD_ACTION(self, queue): - Device().set_current() - buffer_s = queue.get(timeout=CHILD_TIMEOUT_SEC) - pickle.loads(buffer_s) # noqa: S301 - - def ASSERT(self, exc_type, exc_msg): - assert exc_type is RuntimeError - assert re.match(r"Memory resource [a-z0-9-]+ was not found", exc_msg) diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index bfead7dd31..a4cf04b4b6 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -4,6 +4,7 @@ import contextlib import gc import multiprocessing as mp +import platform try: import psutil @@ -11,15 +12,15 @@ HAVE_PSUTIL = False else: HAVE_PSUTIL = True - import pytest -from cuda.core.experimental import _memory + from cuda.core.experimental._utils.cuda_utils import driver + CHILD_TIMEOUT_SEC = 20 NBYTES = 64 -USING_FDS = _memory._IPC_HANDLE_TYPE == driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR +USING_FDS = True if platform.system() == "Linux" else False skip_if_unrunnable = pytest.mark.skipif( not USING_FDS or not HAVE_PSUTIL, reason="mempool allocation handle is not using fds or psutil is unavailable" ) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 5886433b22..9a76a23704 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -426,7 +426,7 @@ def test_mempool_attributes_ownership(mempool_device): device = mempool_device mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE)) attributes = mr.attributes - old_handle = mr._mempool_handle + old_handle = mr.handle mr.close() del mr @@ -440,12 +440,6 @@ def test_mempool_attributes_ownership(mempool_device): with pytest.raises(RuntimeError, match="DeviceMemoryResource is expired"): _ = attributes.used_mem_high - # Even if we stuff the original handle into a new class. - mr._mempool_handle, old_handle = old_handle, mr._mempool_handle - with pytest.raises(RuntimeError, match="DeviceMemoryResource is expired"): - _ = attributes.used_mem_high - mr._mempool_handle = old_handle - # Ensure that memory views dellocate their reference to dlpack tensors @pytest.mark.skipif(np is None, reason="numpy is not installed") From 2c9d5ec4b2d5cbcd950899fa358ffd526c4825f8 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 3 Oct 2025 21:57:47 +0000 Subject: [PATCH 10/18] minor fixes - still failing --- cuda_core/cuda/core/experimental/_memory.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index b64e77c83d..c789bfb189 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -6,7 +6,7 @@ from __future__ import annotations cimport cpython from libc.stdint cimport uintptr_t, intptr_t -from libc.string cimport memset +from libc.string cimport memset, memcpy from cuda.bindings cimport cydriver @@ -70,7 +70,7 @@ cdef class Buffer: @classmethod def _init(cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None): cdef Buffer self = Buffer.__new__(cls) - self._ptr = (int(ptr)) + self._ptr = (int(ptr)) self._ptr_obj = ptr self._size = size self._mr = mr @@ -158,7 +158,7 @@ cdef class Buffer: if not mr.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") cdef cydriver.CUmemPoolPtrExportData share_data - share_data.reserved = ipc_buffer._reserved + memcpy(share_data.reserved, (ipc_buffer._reserved), sizeof(share_data.reserved)) cdef cydriver.CUdeviceptr ptr with nogil: HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &share_data)) @@ -711,7 +711,7 @@ cdef class DeviceMemoryResource(_DeviceMemoryResourceBase, MemoryResource): def __dealloc__(self): self.close() - def close(self): + cpdef close(self): """Close the device memory resource and destroy the associated memory pool if owned.""" if self._mempool_handle == NULL: return From 638bf59cc4e137b1efd7a4a7883b3aeab3bac1b2 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 3 Oct 2025 22:49:52 +0000 Subject: [PATCH 11/18] fix casting and avoid repetive assignment --- cuda_core/cuda/core/experimental/_memory.pyx | 7 +- cuda_core/tests/memory_ipc/test_errors.py | 143 +++++++++++++++++++ 2 files changed, 145 insertions(+), 5 deletions(-) create mode 100644 cuda_core/tests/memory_ipc/test_errors.py diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index c789bfb189..1753d80a3a 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -814,18 +814,15 @@ cdef class DeviceMemoryResource(_DeviceMemoryResourceBase, MemoryResource): cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) self._dev_id = device_id - self._mempool_handle = NULL - self._attributes = None self._ipc_handle_type = _IPC_HANDLE_TYPE self._mempool_owned = True self._is_mapped = True - self._uuid = None - self._alloc_handle = None # only used for non-imported + #self._alloc_handle = None # only used for non-imported cdef int handle = int(alloc_handle) with nogil: HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( - &(self._mempool_handle), &handle, _IPC_HANDLE_TYPE, 0) + &(self._mempool_handle), handle, _IPC_HANDLE_TYPE, 0) ) if uuid is not None: registered = self.register(uuid) diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py new file mode 100644 index 0000000000..3e8265b39c --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -0,0 +1,143 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import multiprocessing +import pickle +import re + +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core.experimental._utils.cuda_utils import CUDAError + +CHILD_TIMEOUT_SEC = 20 +NBYTES = 64 +POOL_SIZE = 2097152 + + +class ChildErrorHarness: + """Test harness for checking errors in child processes. Subclasses override + PARENT_ACTION, CHILD_ACTION, and ASSERT (see below for examples).""" + + def test_main(self, ipc_device, ipc_memory_resource): + """Parent process that checks child errors.""" + # Attach fixtures to this object for convenience. These can be accessed + # from PARENT_ACTION. + self.device = ipc_device + self.mr = ipc_memory_resource + + # Start a child process to generate error info. + pipe = [multiprocessing.Queue() for _ in range(2)] + process = multiprocessing.Process(target=self.child_main, args=(pipe, self.device, self.mr)) + process.start() + + # Interact. + self.PARENT_ACTION(pipe[0]) + + # Check the error. + exc_type, exc_msg = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) + self.ASSERT(exc_type, exc_msg) + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + def child_main(self, pipe, device, mr): + """Child process that pushes IPC errors to a shared pipe for testing.""" + self.device = device + self.device.set_current() + self.mr = mr + try: + self.CHILD_ACTION(pipe[0]) + except Exception as e: + exc_info = type(e), str(e) + else: + exc_info = None, None + pipe[1].put(exc_info) + + +class TestAllocFromImportedMr(ChildErrorHarness): + """Error when attempting to allocate from an import memory resource.""" + + def PARENT_ACTION(self, queue): + queue.put(self.mr) + + def CHILD_ACTION(self, queue): + mr = queue.get(timeout=CHILD_TIMEOUT_SEC) + mr.allocate(NBYTES) + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is TypeError + assert exc_msg == "Cannot allocate from a mapped IPC-enabled memory resource" + + +class TestImportWrongMR(ChildErrorHarness): + """Error when importing a buffer from the wrong memory resource.""" + + def PARENT_ACTION(self, queue): + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mr2 = DeviceMemoryResource(self.device, options=options) + buffer = mr2.allocate(NBYTES) + queue.put([self.mr, buffer.get_ipc_descriptor()]) # Note: mr does not own this buffer + + def CHILD_ACTION(self, queue): + mr, buffer_desc = queue.get(timeout=CHILD_TIMEOUT_SEC) + Buffer.from_ipc_descriptor(mr, buffer_desc) + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is CUDAError + assert "CUDA_ERROR_INVALID_VALUE" in exc_msg + + +class TestExportImportedMR(ChildErrorHarness): + """Error when exporting a memory resource that was imported.""" + + def PARENT_ACTION(self, queue): + queue.put(self.mr) + + def CHILD_ACTION(self, queue): + mr = queue.get(timeout=CHILD_TIMEOUT_SEC) + mr.get_allocation_handle() + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is RuntimeError + assert exc_msg == "Imported memory resource cannot be exported" + + +class TestImportBuffer(ChildErrorHarness): + """Error when using a buffer as a buffer descriptor.""" + + def PARENT_ACTION(self, queue): + # Note: if the buffer is not attached to something to prolong its life, + # CUDA_ERROR_INVALID_CONTEXT is raised from Buffer.__del__ + self.buffer = self.mr.allocate(NBYTES) + queue.put(self.buffer) + + def CHILD_ACTION(self, queue): + buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) + Buffer.from_ipc_descriptor(self.mr, buffer) + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is TypeError + assert exc_msg.startswith("Argument 'ipc_buffer' has incorrect type") + + +class TestDanglingBuffer(ChildErrorHarness): + """ + Error when importing a buffer object without registering its memory + resource. + """ + + def PARENT_ACTION(self, queue): + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mr2 = DeviceMemoryResource(self.device, options=options) + self.buffer = mr2.allocate(NBYTES) + buffer_s = pickle.dumps(self.buffer) # noqa: S301 + queue.put(buffer_s) # Note: mr2 not sent + + def CHILD_ACTION(self, queue): + Device().set_current() + buffer_s = queue.get(timeout=CHILD_TIMEOUT_SEC) + pickle.loads(buffer_s) # noqa: S301 + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is RuntimeError + assert re.match(r"Memory resource [a-z0-9-]+ was not found", exc_msg) From 0c64e8e28c486dfc0d1c897bd36f9ae4663af22a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 4 Oct 2025 02:31:56 +0000 Subject: [PATCH 12/18] ensure we have C access for DeviceMemoryResource --- cuda_core/cuda/core/experimental/_memory.pyx | 93 ++++++++++++-------- cuda_core/cuda/core/experimental/_stream.pxd | 16 ++++ cuda_core/cuda/core/experimental/_stream.pyx | 11 --- 3 files changed, 72 insertions(+), 48 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 1753d80a3a..801f5f8ff9 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -10,6 +10,7 @@ from libc.string cimport memset, memcpy from cuda.bindings cimport cydriver +from cuda.core.experimental._stream cimport Stream as cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, @@ -95,7 +96,10 @@ cdef class Buffer: the behavior depends on the underlying memory resource. """ if self._ptr and self._mr is not None: - self._mr.deallocate(self._ptr, self._size, stream) + if isinstance(self._mr, _cyMemoryResource): + (<_cyMemoryResource>(self._mr))._deallocate(self._ptr, self._size, stream) + else: + self._mr.deallocate(self._ptr, self._size, stream) self._ptr = 0 self._mr = None self._ptr_obj = None @@ -286,6 +290,17 @@ cdef class Buffer: return Buffer._init(ptr, size, mr=mr) +cdef class _cyMemoryResource: + """ + Internal only. Responsible for offering fast C method access. + """ + cdef Buffer _allocate(self, size_t size, cyStream stream): + raise NotImplementedError + + cdef int _deallocate(self, intptr_t ptr, size_t size, cyStream stream) except?-1: + raise NotImplementedError + + class MemoryResource(abc.ABC): """Abstract base class for memory resources that manage allocation and deallocation of buffers. @@ -542,33 +557,7 @@ class DeviceMemoryResourceAttributes: _ipc_registry = {} -cdef class _DeviceMemoryResourceBase: - """Internal only. Responsible for offering C layout & attribute access.""" - cdef: - int _dev_id - cydriver.CUmemoryPool _mempool_handle - object _attributes - cydriver.CUmemAllocationHandleType _ipc_handle_type - bint _mempool_owned - bint _is_mapped - object _uuid - IPCAllocationHandle _alloc_handle - - def __cinit__(self): - self._dev_id = cydriver.CU_DEVICE_INVALID - self._mempool_handle = NULL - self._attributes = None - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX - self._mempool_owned = False - self._is_mapped = False - self._uuid = None - self._alloc_handle = None - - def __dealloc__(self): - pass - - -cdef class DeviceMemoryResource(_DeviceMemoryResourceBase, MemoryResource): +cdef class DeviceMemoryResource(_cyMemoryResource, MemoryResource): """ Create a device memory resource managing a stream-ordered memory pool. @@ -647,9 +636,27 @@ cdef class DeviceMemoryResource(_DeviceMemoryResourceBase, MemoryResource): associated MMR. """ cdef: - dict __dict__ + int _dev_id + cydriver.CUmemoryPool _mempool_handle + object _attributes + cydriver.CUmemAllocationHandleType _ipc_handle_type + bint _mempool_owned + bint _is_mapped + object _uuid + IPCAllocationHandle _alloc_handle + dict __dict__ # TODO: check if we still need this object __weakref__ + def __cinit__(self): + self._dev_id = cydriver.CU_DEVICE_INVALID + self._mempool_handle = NULL + self._attributes = None + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX + self._mempool_owned = False + self._is_mapped = False + self._uuid = None + self._alloc_handle = None + def __init__(self, device_id: int | Device, options=None): cdef int dev_id = getattr(device_id, 'device_id', device_id) opts = check_or_create_options( @@ -862,6 +869,18 @@ cdef class DeviceMemoryResource(_DeviceMemoryResourceBase, MemoryResource): raise return self._alloc_handle + cdef Buffer _allocate(self, size_t size, cyStream stream): + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr + with nogil: + HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._mempool_handle, s)) + cdef Buffer buf = Buffer.__new__(Buffer) + buf._ptr = (devptr) + buf._ptr_obj = None + buf._size = size + buf._mr = self + return buf + def allocate(self, size_t size, stream: Stream = None) -> Buffer: """Allocate a buffer of the requested size. @@ -883,11 +902,14 @@ cdef class DeviceMemoryResource(_DeviceMemoryResourceBase, MemoryResource): raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") if stream is None: stream = default_stream() - cdef cydriver.CUstream s = (stream.handle) - cdef cydriver.CUdeviceptr devptr + return self._allocate(size, stream) + + cdef int _deallocate(self, intptr_t ptr, size_t size, cyStream stream) except?-1: + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr = ptr with nogil: - HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._mempool_handle, s)) - return Buffer._init(devptr, size, self) + HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) + return 0 def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): """Deallocate a buffer previously allocated by this resource. @@ -904,10 +926,7 @@ cdef class DeviceMemoryResource(_DeviceMemoryResourceBase, MemoryResource): """ if stream is None: stream = default_stream() - cdef cydriver.CUstream s = (stream.handle) - cdef cydriver.CUdeviceptr devptr = ptr - with nogil: - HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) + self._deallocate(ptr, size, stream) @property def attributes(self) -> DeviceMemoryResourceAttributes: diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index 6b8a7f0f60..9001cc6dbb 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -6,3 +6,19 @@ from cuda.bindings cimport cydriver cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except* + + +cdef class Stream: + + cdef: + cydriver.CUstream _handle + object _owner + object _builtin + object _nonblocking + object _priority + cydriver.CUdevice _device_id + cydriver.CUcontext _ctx_handle + + cpdef close(self) + cdef int _get_context(self) except?-1 nogil + cdef int _get_device_and_context(self) except?-1 diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 25306c4525..9d93012a3b 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -108,18 +108,7 @@ cdef class Stream: New streams should instead be created through a :obj:`~_device.Device` object, or created directly through using an existing handle using Stream.from_handle(). - """ - - cdef: - cydriver.CUstream _handle - object _owner - object _builtin - object _nonblocking - object _priority - cydriver.CUdevice _device_id - cydriver.CUcontext _ctx_handle - def __cinit__(self): self._handle = (NULL) self._device_id = cydriver.CU_DEVICE_INVALID # delayed From 4c4bb1779894ec81e3901bd909b95820ff2bff22 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 4 Oct 2025 03:05:56 +0000 Subject: [PATCH 13/18] restore the contracts for now (the deallocation stream should be fixed) --- cuda_core/cuda/core/experimental/_memory.pyx | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 801f5f8ff9..1bfbe59acd 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -97,6 +97,10 @@ cdef class Buffer: """ if self._ptr and self._mr is not None: if isinstance(self._mr, _cyMemoryResource): + # FIXME + if stream is None: + stream = Stream.__new__(Stream) + ((stream))._handle = (0) (<_cyMemoryResource>(self._mr))._deallocate(self._ptr, self._size, stream) else: self._mr.deallocate(self._ptr, self._size, stream) @@ -113,7 +117,13 @@ cdef class Buffer: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Buffer.handle)``. """ - return self._ptr_obj + if self._ptr_obj is not None: + return self._ptr_obj + elif self._ptr: + return self._ptr + else: + # contract: Buffer is closed + return None @property def size(self) -> int: From c593ac9476dfae43120014210b80e14190af5be2 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 4 Oct 2025 03:29:48 +0000 Subject: [PATCH 14/18] fully cythonize Stream --- cuda_core/cuda/core/experimental/_stream.pxd | 6 ++--- cuda_core/cuda/core/experimental/_stream.pyx | 27 ++++++++------------ 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index 9001cc6dbb..2fe77e07b4 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -13,9 +13,9 @@ cdef class Stream: cdef: cydriver.CUstream _handle object _owner - object _builtin - object _nonblocking - object _priority + bint _builtin + int _nonblocking + int _priority cydriver.CUdevice _device_id cydriver.CUcontext _ctx_handle diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 9d93012a3b..85204306eb 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -4,7 +4,7 @@ from __future__ import annotations -from libc.stdint cimport uintptr_t +from libc.stdint cimport uintptr_t, INT32_MIN from cuda.bindings cimport cydriver @@ -111,6 +111,10 @@ cdef class Stream: """ def __cinit__(self): self._handle = (NULL) + self._owner = None + self._builtin = False + self._nonblocking = -1 # delayed + self._priority = INT32_MIN # delayed self._device_id = cydriver.CU_DEVICE_INVALID # delayed self._ctx_handle = CU_CONTEXT_INVALID # delayed @@ -124,27 +128,19 @@ cdef class Stream: def _legacy_default(cls): cdef Stream self = Stream.__new__(cls) self._handle = (cydriver.CU_STREAM_LEGACY) - self._owner = None self._builtin = True - self._nonblocking = None # delayed - self._priority = None # delayed return self @classmethod def _per_thread_default(cls): cdef Stream self = Stream.__new__(cls) self._handle = (cydriver.CU_STREAM_PER_THREAD) - self._owner = None self._builtin = True - self._nonblocking = None # delayed - self._priority = None # delayed return self @classmethod def _init(cls, obj: Optional[IsStreamT] = None, options=None, device_id: int = None): cdef Stream self = Stream.__new__(cls) - self._owner = None - self._builtin = False if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") @@ -152,8 +148,6 @@ cdef class Stream: self._handle = _try_to_get_stream_ptr(obj) # TODO: check if obj is created under the current context/device self._owner = obj - self._nonblocking = None # delayed - self._priority = None # delayed return self cdef StreamOptions opts = check_or_create_options(StreamOptions, options, "Stream options") @@ -177,9 +171,8 @@ cdef class Stream: with nogil: HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, prio)) self._handle = s - self._owner = None - self._nonblocking = nonblocking - self._priority = priority + self._nonblocking = int(nonblocking) + self._priority = prio self._device_id = device_id if device_id is not None else self._device_id return self @@ -220,20 +213,20 @@ cdef class Stream: def is_nonblocking(self) -> bool: """Return True if this is a nonblocking stream, otherwise False.""" cdef unsigned int flags - if self._nonblocking is None: + if self._nonblocking == -1: with nogil: HANDLE_RETURN(cydriver.cuStreamGetFlags(self._handle, &flags)) if flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING: self._nonblocking = True else: self._nonblocking = False - return self._nonblocking + return bool(self._nonblocking) @property def priority(self) -> int: """Return the stream priority.""" cdef int prio - if self._priority is None: + if self._priority == INT32_MIN: with nogil: HANDLE_RETURN(cydriver.cuStreamGetPriority(self._handle, &prio)) self._priority = prio From fae5e7f191ec53fef460cc485a7438a4abf77a64 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 4 Oct 2025 03:37:09 +0000 Subject: [PATCH 15/18] make linter happy --- cuda_core/tests/memory_ipc/test_leaks.py | 5 +---- cuda_core/tests/test_memory.py | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index a4cf04b4b6..756b2f8104 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -14,13 +14,10 @@ HAVE_PSUTIL = True import pytest -from cuda.core.experimental._utils.cuda_utils import driver - - CHILD_TIMEOUT_SEC = 20 NBYTES = 64 -USING_FDS = True if platform.system() == "Linux" else False +USING_FDS = platform.system() == "Linux" skip_if_unrunnable = pytest.mark.skipif( not USING_FDS or not HAVE_PSUTIL, reason="mempool allocation handle is not using fds or psutil is unavailable" ) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 9a76a23704..9061dc270e 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -426,7 +426,6 @@ def test_mempool_attributes_ownership(mempool_device): device = mempool_device mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE)) attributes = mr.attributes - old_handle = mr.handle mr.close() del mr @@ -436,7 +435,7 @@ def test_mempool_attributes_ownership(mempool_device): # Even when a new object is created (we found a case where the same # mempool handle was really reused). - mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE)) + mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE)) # noqa: F841 with pytest.raises(RuntimeError, match="DeviceMemoryResource is expired"): _ = attributes.used_mem_high From 2dd91380a59fa8fe8b8cd5d875fe1e497c40bf24 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 6 Oct 2025 21:31:45 +0000 Subject: [PATCH 16/18] address review comments --- cuda_core/cuda/core/experimental/_memory.pyx | 9 ++++++--- cuda_core/cuda/core/experimental/_stream.pyx | 8 ++++---- cuda_core/docs/source/release/0.X.Y-notes.rst | 1 - cuda_core/examples/memory_ops.py | 6 +++--- cuda_core/tests/test_launcher.py | 2 +- cuda_core/tests/test_memory.py | 2 +- 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 1bfbe59acd..80088eeabe 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -5,6 +5,7 @@ from __future__ import annotations cimport cpython +from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t, intptr_t from libc.string cimport memset, memcpy @@ -95,13 +96,15 @@ cdef class Buffer: The stream object to use for asynchronous deallocation. If None, the behavior depends on the underlying memory resource. """ + cdef _cyMemoryResource cy_mr if self._ptr and self._mr is not None: if isinstance(self._mr, _cyMemoryResource): # FIXME if stream is None: stream = Stream.__new__(Stream) ((stream))._handle = (0) - (<_cyMemoryResource>(self._mr))._deallocate(self._ptr, self._size, stream) + cy_mr = <_cyMemoryResource>(self._mr) + cy_mr._deallocate(self._ptr, self._size, stream) else: self._mr.deallocate(self._ptr, self._size, stream) self._ptr = 0 @@ -123,7 +126,7 @@ cdef class Buffer: return self._ptr else: # contract: Buffer is closed - return None + return 0 @property def size(self) -> int: @@ -673,7 +676,7 @@ cdef class DeviceMemoryResource(_cyMemoryResource, MemoryResource): DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", keep_none=True ) cdef cydriver.cuuint64_t current_threshold - cdef cydriver.cuuint64_t max_threshold = 0xFFFFFFFFFFFFFFFF + cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX cdef cydriver.CUmemPoolProps properties if opts is None: diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 85204306eb..280380e53a 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -113,10 +113,10 @@ cdef class Stream: self._handle = (NULL) self._owner = None self._builtin = False - self._nonblocking = -1 # delayed - self._priority = INT32_MIN # delayed - self._device_id = cydriver.CU_DEVICE_INVALID # delayed - self._ctx_handle = CU_CONTEXT_INVALID # delayed + self._nonblocking = -1 # lazy init'd + self._priority = INT32_MIN # lazy init'd + self._device_id = cydriver.CU_DEVICE_INVALID # lazy init'd + self._ctx_handle = CU_CONTEXT_INVALID # lazy init'd def __init__(self, *args, **kwargs): raise RuntimeError( diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index e87cbdee31..4c967248e9 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -22,7 +22,6 @@ Breaking Changes - **CUDA 11 support dropped**: CUDA 11 support is no longer tested and it may or may not work with cuda.bindings and CTK 11.x. Users are encouraged to migrate to CUDA 12.x or 13.x. - Support for ``cuda-bindings`` (and ``cuda-python``) < 12.6.2 is dropped. Internally, ``cuda.core`` now always requires the `new binding module layout `_. As per the ``cuda-bindings`` `support policy `_), CUDA 12 users are encouraged to use the latest ``cuda-bindings`` 12.9.x, which is backward-compatible with all CUDA Toolkit 12.y. - **LaunchConfig grid parameter interpretation**: When :attr:`LaunchConfig.cluster` is specified, the :attr:`LaunchConfig.grid` parameter now correctly represents the number of clusters instead of blocks. Previously, the grid parameter was incorrectly interpreted as blocks, causing a mismatch with the expected C++ behavior. This change ensures that ``LaunchConfig(grid=4, cluster=2, block=32)`` correctly produces 4 clusters × 2 blocks/cluster = 8 total blocks, matching the C++ equivalent ``cudax::make_hierarchy(cudax::grid_dims(4), cudax::cluster_dims(2), cudax::block_dims(32))``. -- When :class:`Buffer` is closed, :attr:`Buffer.handle` is now set to ``None``. It was previously set to ``0`` by accident. New features diff --git a/cuda_core/examples/memory_ops.py b/cuda_core/examples/memory_ops.py index 391a183bb7..c4abd06e2c 100644 --- a/cuda_core/examples/memory_ops.py +++ b/cuda_core/examples/memory_ops.py @@ -128,8 +128,8 @@ cp.cuda.Stream.null.use() # reset CuPy's current stream to the null stream # Verify buffers are properly closed -assert device_buffer.handle is None, "Device buffer should be closed" -assert pinned_buffer.handle is None, "Pinned buffer should be closed" -assert new_device_buffer.handle is None, "New device buffer should be closed" +assert device_buffer.handle == 0, "Device buffer should be closed" +assert pinned_buffer.handle == 0, "Pinned buffer should be closed" +assert new_device_buffer.handle == 0, "New device buffer should be closed" print("Memory management example completed!") diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index cf48661f41..2698ccdc9d 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -370,4 +370,4 @@ def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_reso cp.cuda.Stream.null.use() # reset CuPy's current stream to the null stream # Verify buffer is properly closed - assert buffer.handle is None, f"{name} buffer should be closed" + assert buffer.handle == 0, f"{name} buffer should be closed" diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 9061dc270e..26cd2a1393 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -223,7 +223,7 @@ def test_buffer_copy_from(): def buffer_close(dummy_mr: MemoryResource): buffer = dummy_mr.allocate(size=1024) buffer.close() - assert buffer.handle is None + assert buffer.handle == 0 assert buffer.memory_resource is None From 3809a3379b01ebbf3fd970066c398735c425f3ae Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 7 Oct 2025 14:58:25 +0000 Subject: [PATCH 17/18] give our ABC class (MemoryResource) fast access in C/Cython --- cuda_core/cuda/core/experimental/_memory.pyx | 132 +++++++++---------- 1 file changed, 64 insertions(+), 68 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 80088eeabe..39afa67234 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -39,9 +39,6 @@ if TYPE_CHECKING: from ._device import Device import uuid -# TODO: define a memory property mixin class and make Buffer and -# MemoryResource both inherit from it - PyCapsule = TypeVar("PyCapsule") """Represent the capsule type.""" @@ -50,7 +47,56 @@ DevicePointerT = Union[driver.CUdeviceptr, int, None] """A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" -cdef class Buffer: +cdef class _cyBuffer: + """ + Internal only. Responsible for offering fast C method access. + """ + cdef: + intptr_t _ptr + size_t _size + _cyMemoryResource _mr + object _ptr_obj + + +cdef class _cyMemoryResource: + """ + Internal only. Responsible for offering fast C method access. + """ + cdef Buffer _allocate(self, size_t size, cyStream stream): + raise NotImplementedError + + cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept: + raise NotImplementedError + + +class MemoryResourceAttributes(abc.ABC): + + @property + @abc.abstractmethod + def is_device_accessible(self) -> bool: + """bool: True if buffers allocated by this resource can be accessed on the device.""" + ... + + @property + @abc.abstractmethod + def is_host_accessible(self) -> bool: + """bool: True if buffers allocated by this resource can be accessed on the host.""" + ... + + @property + @abc.abstractmethod + def device_id(self) -> int: + """int: The device ordinal for which this memory resource is responsible. + + Raises + ------ + RuntimeError + If the resource is not bound to a specific device. + """ + ... + + +cdef class Buffer(_cyBuffer, MemoryResourceAttributes): """Represent a handle to allocated memory. This generic object provides a unified representation for how @@ -59,12 +105,7 @@ cdef class Buffer: Support for data interchange mechanisms are provided by DLPack. """ - - cdef: - intptr_t _ptr - size_t _size - object _mr - object _ptr_obj + cdef dict __dict__ # required if inheriting from both Cython/Python classes def __init__(self, *args, **kwargs): raise RuntimeError("Buffer objects cannot be instantiated directly. Please use MemoryResource APIs.") @@ -96,17 +137,12 @@ cdef class Buffer: The stream object to use for asynchronous deallocation. If None, the behavior depends on the underlying memory resource. """ - cdef _cyMemoryResource cy_mr if self._ptr and self._mr is not None: - if isinstance(self._mr, _cyMemoryResource): - # FIXME - if stream is None: - stream = Stream.__new__(Stream) - ((stream))._handle = (0) - cy_mr = <_cyMemoryResource>(self._mr) - cy_mr._deallocate(self._ptr, self._size, stream) - else: - self._mr.deallocate(self._ptr, self._size, stream) + # To be fixed in NVIDIA/cuda-python#1032 + if stream is None: + stream = Stream.__new__(Stream) + ((stream))._handle = (0) + self._mr._deallocate(self._ptr, self._size, stream) self._ptr = 0 self._mr = None self._ptr_obj = None @@ -303,18 +339,7 @@ cdef class Buffer: return Buffer._init(ptr, size, mr=mr) -cdef class _cyMemoryResource: - """ - Internal only. Responsible for offering fast C method access. - """ - cdef Buffer _allocate(self, size_t size, cyStream stream): - raise NotImplementedError - - cdef int _deallocate(self, intptr_t ptr, size_t size, cyStream stream) except?-1: - raise NotImplementedError - - -class MemoryResource(abc.ABC): +cdef class MemoryResource(_cyMemoryResource, MemoryResourceAttributes, abc.ABC): """Abstract base class for memory resources that manage allocation and deallocation of buffers. Subclasses must implement methods for allocating and deallocation, as well as properties @@ -323,14 +348,10 @@ class MemoryResource(abc.ABC): hold a reference to self, the buffer properties are retrieved simply by looking up the underlying memory resource's respective property.) """ + cdef dict __dict__ # required if inheriting from both Cython/Python classes - @abc.abstractmethod - def __init__(self, *args, **kwargs): - """Initialize the memory resource. - - Subclasses may use additional arguments to configure the resource. - """ - ... + cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept: + self.deallocate(ptr, size, stream) @abc.abstractmethod def allocate(self, size_t size, stream: Stream = None) -> Buffer: @@ -370,30 +391,6 @@ class MemoryResource(abc.ABC): """ ... - @property - @abc.abstractmethod - def is_device_accessible(self) -> bool: - """bool: True if buffers allocated by this resource can be accessed on the device.""" - ... - - @property - @abc.abstractmethod - def is_host_accessible(self) -> bool: - """bool: True if buffers allocated by this resource can be accessed on the host.""" - ... - - @property - @abc.abstractmethod - def device_id(self) -> int: - """int: The device ordinal for which this memory resource is responsible. - - Raises - ------ - RuntimeError - If the resource is not bound to a specific device. - """ - ... - # IPC is currently only supported on Linux. On other platforms, the IPC handle # type is set equal to the no-IPC handle type. @@ -570,7 +567,7 @@ class DeviceMemoryResourceAttributes: _ipc_registry = {} -cdef class DeviceMemoryResource(_cyMemoryResource, MemoryResource): +cdef class DeviceMemoryResource(MemoryResource): """ Create a device memory resource managing a stream-ordered memory pool. @@ -657,7 +654,7 @@ cdef class DeviceMemoryResource(_cyMemoryResource, MemoryResource): bint _is_mapped object _uuid IPCAllocationHandle _alloc_handle - dict __dict__ # TODO: check if we still need this + dict __dict__ # required if inheriting from both Cython/Python classes object __weakref__ def __cinit__(self): @@ -917,14 +914,13 @@ cdef class DeviceMemoryResource(_cyMemoryResource, MemoryResource): stream = default_stream() return self._allocate(size, stream) - cdef int _deallocate(self, intptr_t ptr, size_t size, cyStream stream) except?-1: + cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept: cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr with nogil: HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) - return 0 - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): + cpdef deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): """Deallocate a buffer previously allocated by this resource. Parameters From 22b0e2e526d68ca76fa77adc7ea97392a33a7586 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 7 Oct 2025 16:27:02 +0000 Subject: [PATCH 18/18] make it clear we need two CUDA contexts for retriving the stream's device --- cuda_core/cuda/core/experimental/_stream.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 280380e53a..c17af4ce46 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -327,12 +327,12 @@ cdef class Stream: cdef int _get_device_and_context(self) except?-1: cdef cydriver.CUcontext curr_ctx if self._device_id == cydriver.CU_DEVICE_INVALID: - # TODO: It is likely faster/safer to call cuCtxGetCurrent? - from cuda.core.experimental._device import Device # avoid circular import - curr_ctx = (Device().context._handle) with nogil: - # Get the stream context first + # Get the current context + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&curr_ctx)) + # Get the stream's context (self.ctx_handle is populated) self._get_context() + # Get the stream's device (may require a context-switching dance) self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx) return 0