-
Notifications
You must be signed in to change notification settings - Fork 62
perf: speed up kernel launch #510
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5b87524
77c311c
bd19d73
0d68a22
a1b6a1e
9e3a39b
f1105f0
54d34f9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -199,10 +199,11 @@ def _numba_type_(self): | |
| @property | ||
| def device_ctypes_pointer(self): | ||
| """Returns the ctypes pointer to the GPU data buffer""" | ||
| if self.gpu_data is None: | ||
| return c_void_p(0) | ||
| else: | ||
| try: | ||
| # apparently faster in the non-exceptional case | ||
| return self.gpu_data.device_ctypes_pointer | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is faster because, as of Python 3.11, there is no cost to the non-exceptional case of a `try` block, and this case appears to be more common in the kernel launching path than the case of `self.gpu_data` being `None`. |
||
| except AttributeError: | ||
| return c_void_p(0) | ||
|
|
||
| @devices.require_context | ||
| def copy_to_device(self, ary, stream=0): | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,23 +21,14 @@ | |
| from .driver import driver | ||
|
|
||
|
|
||
| class _DeviceList(object): | ||
| def __getattr__(self, attr): | ||
| # First time looking at "lst" attribute. | ||
| if attr == "lst": | ||
| # Device list is not initialized. | ||
| # Query all CUDA devices. | ||
| numdev = driver.get_device_count() | ||
| gpus = [ | ||
| _DeviceContextManager(driver.get_device(devid)) | ||
| for devid in range(numdev) | ||
| ] | ||
| # Define "lst" to avoid re-initialization | ||
| self.lst = gpus | ||
| return gpus | ||
|
|
||
| # Other attributes | ||
| return super(_DeviceList, self).__getattr__(attr) | ||
| class _DeviceList: | ||
| @property | ||
| @functools.cache | ||
| def lst(self): | ||
| return [ | ||
| _DeviceContextManager(driver.get_device(devid)) | ||
| for devid in range(driver.get_device_count()) | ||
| ] | ||
|
|
||
| def __getitem__(self, devnum): | ||
| """ | ||
|
|
@@ -79,6 +70,9 @@ class _DeviceContextManager(object): | |
|
|
||
| def __init__(self, device): | ||
| self._device = device | ||
| # Forwarded directly, to avoid the performance overhead of | ||
| # `__getattr__` and method lookup for a commonly accessed method | ||
| self.get_primary_context = self._device.get_primary_context | ||
|
|
||
| def __getattr__(self, item): | ||
| return getattr(self._device, item) | ||
|
|
@@ -88,10 +82,10 @@ def __enter__(self): | |
|
|
||
| def __exit__(self, exc_type, exc_val, exc_tb): | ||
| # this will verify that we are popping the right device context. | ||
| self._device.get_primary_context().pop() | ||
| self.get_primary_context().pop() | ||
|
|
||
| def __str__(self): | ||
| return "<Managed Device {self.id}>".format(self=self) | ||
| return f"<Managed Device {self.id}>" | ||
|
|
||
|
|
||
| class _Runtime(object): | ||
|
|
@@ -147,7 +141,8 @@ def get_or_create_context(self, devnum): | |
| return attached_ctx | ||
| else: | ||
| devnum = int(devnum) | ||
| return self._activate_context_for(devnum) | ||
| with self._lock: | ||
| return self._activate_context_for(devnum) | ||
|
|
||
| def _get_or_create_context_uncached(self, devnum): | ||
| """See also ``get_or_create_context(devnum)``. | ||
|
|
@@ -166,28 +161,29 @@ def _get_or_create_context_uncached(self, devnum): | |
| ctx_handle = ctx.handle.value | ||
| ac_ctx_handle = ac.context_handle.value | ||
| if ctx_handle != ac_ctx_handle: | ||
| msg = ( | ||
| raise RuntimeError( | ||
| "Numba cannot operate on non-primary" | ||
| " CUDA context {:x}" | ||
| f" CUDA context {ac_ctx_handle:x}" | ||
| ) | ||
| raise RuntimeError(msg.format(ac_ctx_handle)) | ||
| # Ensure the context is ready | ||
| ctx.prepare_for_use() | ||
| return ctx | ||
|
|
||
| def _activate_context_for(self, devnum): | ||
| with self._lock: | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This lock (which is reentrant, so this was not incorrect) was being acquired inside another section that already held it, so I moved the lock acquisition to the one place where this method was called without the lock, which removes one acquire and release operation. |
||
| gpu = self.gpus[devnum] | ||
| newctx = gpu.get_primary_context() | ||
| # Detect unexpected context switch | ||
| cached_ctx = self._get_attached_context() | ||
| if cached_ctx is not None and cached_ctx is not newctx: | ||
| raise RuntimeError("Cannot switch CUDA-context.") | ||
| newctx.push() | ||
| return newctx | ||
| gpu = self.gpus[devnum] | ||
| newctx = gpu.get_primary_context() | ||
| # Detect unexpected context switch | ||
| cached_ctx = self._get_attached_context() | ||
| if cached_ctx is not None and cached_ctx is not newctx: | ||
| raise RuntimeError("Cannot switch CUDA-context.") | ||
| newctx.push() | ||
| return newctx | ||
|
|
||
| def _get_attached_context(self): | ||
| return getattr(self._tls, "attached_context", None) | ||
| try: | ||
| return self._tls.attached_context | ||
| except AttributeError: | ||
| return None | ||
|
|
||
| def _set_attached_context(self, ctx): | ||
| self._tls.attached_context = ctx | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.