diff --git a/numba_cuda/numba/cuda/args.py b/numba_cuda/numba/cuda/args.py index 755e19d00..9adecfde9 100644 --- a/numba_cuda/numba/cuda/args.py +++ b/numba_cuda/numba/cuda/args.py @@ -6,16 +6,13 @@ memory transfers before & after the kernel call. """ -import abc - from numba.cuda.typing.typeof import typeof, Purpose -class ArgHint(metaclass=abc.ABCMeta): +class ArgHint: def __init__(self, value): self.value = value - @abc.abstractmethod def to_device(self, retr, stream=0): """ :param stream: a stream to use when copying data @@ -25,7 +22,6 @@ def to_device(self, retr, stream=0): :return: a value (usually an `DeviceNDArray`) to be passed to the kernel """ - pass @property def _numba_type_(self): diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index e8dc27f3b..900aac6bd 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -199,10 +199,11 @@ def _numba_type_(self): @property def device_ctypes_pointer(self): """Returns the ctypes pointer to the GPU data buffer""" - if self.gpu_data is None: - return c_void_p(0) - else: + try: + # apparently faster in the non-exceptional case return self.gpu_data.device_ctypes_pointer + except AttributeError: + return c_void_p(0) @devices.require_context def copy_to_device(self, ary, stream=0): diff --git a/numba_cuda/numba/cuda/cudadrv/devices.py b/numba_cuda/numba/cuda/cudadrv/devices.py index c2e678421..514191384 100644 --- a/numba_cuda/numba/cuda/cudadrv/devices.py +++ b/numba_cuda/numba/cuda/cudadrv/devices.py @@ -21,23 +21,14 @@ from .driver import driver -class _DeviceList(object): - def __getattr__(self, attr): - # First time looking at "lst" attribute. - if attr == "lst": - # Device list is not initialized. - # Query all CUDA devices. 
- numdev = driver.get_device_count() - gpus = [ - _DeviceContextManager(driver.get_device(devid)) - for devid in range(numdev) - ] - # Define "lst" to avoid re-initialization - self.lst = gpus - return gpus - - # Other attributes - return super(_DeviceList, self).__getattr__(attr) +class _DeviceList: + @property + @functools.cache + def lst(self): + return [ + _DeviceContextManager(driver.get_device(devid)) + for devid in range(driver.get_device_count()) + ] def __getitem__(self, devnum): """ @@ -79,6 +70,9 @@ class _DeviceContextManager(object): def __init__(self, device): self._device = device + # Forwarded directly, to avoid the performance overhead of + # `__getattr__` and method lookup for a commonly accessed method + self.get_primary_context = self._device.get_primary_context def __getattr__(self, item): return getattr(self._device, item) @@ -88,10 +82,10 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): # this will verify that we are popping the right device context. - self._device.get_primary_context().pop() + self.get_primary_context().pop() def __str__(self): - return "<Managed Device {self.id}>".format(self=self) + return f"<Managed Device {self.id}>" class _Runtime(object): @@ -147,7 +141,8 @@ def get_or_create_context(self, devnum): return attached_ctx else: devnum = int(devnum) - return self._activate_context_for(devnum) + with self._lock: + return self._activate_context_for(devnum) def _get_or_create_context_uncached(self, devnum): """See also ``get_or_create_context(devnum)``. 
@@ -166,28 +161,29 @@ def _get_or_create_context_uncached(self, devnum): ctx_handle = ctx.handle.value ac_ctx_handle = ac.context_handle.value if ctx_handle != ac_ctx_handle: - msg = ( + raise RuntimeError( "Numba cannot operate on non-primary" - " CUDA context {:x}" + f" CUDA context {ac_ctx_handle:x}" ) - raise RuntimeError(msg.format(ac_ctx_handle)) # Ensure the context is ready ctx.prepare_for_use() return ctx def _activate_context_for(self, devnum): - with self._lock: - gpu = self.gpus[devnum] - newctx = gpu.get_primary_context() - # Detect unexpected context switch - cached_ctx = self._get_attached_context() - if cached_ctx is not None and cached_ctx is not newctx: - raise RuntimeError("Cannot switch CUDA-context.") - newctx.push() - return newctx + gpu = self.gpus[devnum] + newctx = gpu.get_primary_context() + # Detect unexpected context switch + cached_ctx = self._get_attached_context() + if cached_ctx is not None and cached_ctx is not newctx: + raise RuntimeError("Cannot switch CUDA-context.") + newctx.push() + return newctx def _get_attached_context(self): - return getattr(self._tls, "attached_context", None) + try: + return self._tls.attached_context + except AttributeError: + return None def _set_attached_context(self, ctx): self._tls.attached_context = ctx diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 69541df47..8bdd4f2af 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -470,10 +470,11 @@ class _ActiveContext(object): def __enter__(self): is_top = False # check TLS cache - if hasattr(self._tls_cache, "ctx_devnum"): - hctx, devnum = self._tls_cache.ctx_devnum - # Not cached. Query the driver API. - else: + cache = self._tls_cache + try: + hctx, devnum = cache.ctx_devnum + except AttributeError: + # Not cached. Query the driver API. 
hctx = driver.cuCtxGetCurrent() if int(hctx) == 0: hctx = None @@ -495,7 +496,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): if self._is_top: - delattr(self._tls_cache, "ctx_devnum") + del self._tls_cache.ctx_devnum def __bool__(self): """Returns True is there's a valid and active CUDA context.""" @@ -2061,6 +2062,10 @@ def deref(): self._mem.refct += 1 weakref.finalize(self, deref) + # pull this attribute out for speed, because it's used often and + # there's overhead to going through `__getattr__` + self.device_ctypes_pointer = self._view.device_ctypes_pointer + def __getattr__(self, fname): """Proxy MemoryPointer methods""" return getattr(self._view, fname) @@ -3072,7 +3077,11 @@ def is_device_memory(obj): "device_pointer" which value is an int object carrying the pointer value of the device memory address. This is not tested in this method. """ - return getattr(obj, "__cuda_memory__", False) + try: + # This is cheaper than getattr in the non-exceptional case + return obj.__cuda_memory__ + except AttributeError: + return False def require_device_memory(obj): diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index d9da06203..617562379 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -558,10 +558,8 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs): kernelargs.append(nitems) kernelargs.append(itemsize) kernelargs.append(data) - for ax in range(devary.ndim): - kernelargs.append(c_intp(devary.shape[ax])) - for ax in range(devary.ndim): - kernelargs.append(c_intp(devary.strides[ax])) + kernelargs.extend(map(c_intp, devary.shape)) + kernelargs.extend(map(c_intp, devary.strides)) elif isinstance(ty, types.CPointer): # Pointer arguments should be a pointer-sized integer diff --git a/numba_cuda/numba/cuda/simulator/kernel.py b/numba_cuda/numba/cuda/simulator/kernel.py index 36cf20a2a..a654cb77b 100644 --- a/numba_cuda/numba/cuda/simulator/kernel.py +++ 
b/numba_cuda/numba/cuda/simulator/kernel.py @@ -11,7 +11,7 @@ from .cudadrv.devicearray import FakeCUDAArray, FakeWithinKernelCUDAArray from .kernelapi import Dim3, FakeCUDAModule, swapped_cuda_module from ..errors import normalize_kernel_dimensions -from ..args import wrap_arg, ArgHint +from ..args import ArgHint, InOut """ @@ -109,7 +109,7 @@ def fake_arg(arg): ) if isinstance(arg, np.ndarray) and arg.ndim > 0: - ret = wrap_arg(arg).to_device(retr) + ret = InOut(arg).to_device(retr) elif isinstance(arg, ArgHint): ret = arg.to_device(retr) elif isinstance(arg, np.void):