17 changes: 0 additions & 17 deletions docs/source/reference/host.rst
@@ -229,20 +229,3 @@ stream, and the stream must remain valid whilst the Numba ``Stream`` object is
in use.

.. autofunction:: numba.cuda.external_stream
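
For example, a stream created by another library can be wrapped as follows (a
minimal sketch; ``raw_ptr`` stands in for a stream handle obtained elsewhere)::

    from numba import cuda

    # raw_ptr: an integer CUDA stream handle owned by another library; it
    # must remain valid while nb_stream is in use.
    nb_stream = cuda.external_stream(raw_ptr)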


Runtime
-------

Numba generally uses the Driver API, but it provides a simple wrapper to the
Runtime API so that the version of the runtime in use can be queried. This is
accessed through ``cuda.runtime``, which is an instance of the
:class:`numba.cuda.cudadrv.runtime.Runtime` class:

.. autoclass:: numba.cuda.cudadrv.runtime.Runtime
:members: get_version, is_supported_version, supported_versions

Whether the runtime in use is officially supported and tested with the current
version of Numba can also be queried:

.. autofunction:: numba.cuda.is_supported_version
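
For reference, a minimal usage sketch of this wrapper (return values are
illustrative and depend on the installed toolkit)::

    from numba import cuda

    print(cuda.runtime.get_version())   # e.g. (12, 0)
    print(cuda.is_supported_version())  # True if this runtime is tested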
13 changes: 13 additions & 0 deletions numba_cuda/numba/cuda/__init__.py
Expand Up @@ -83,6 +83,19 @@
implementation = "NVIDIA"


# The default compute capability as set by the upstream Numba implementation.
config_default_cc = config.CUDA_DEFAULT_PTX_CC

# The default compute capability for Numba-CUDA. This overrides the upstream
# Numba built-in default of 5.0, unless the user has configured an even higher
# value, in which case the user-specified value is used. This default is
# aligned with recent toolkit versions.
numba_cuda_default_ptx_cc = (7, 5)

if numba_cuda_default_ptx_cc > config_default_cc:
config.CUDA_DEFAULT_PTX_CC = numba_cuda_default_ptx_cc
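
# A sketch of the resulting behaviour (CC values are illustrative):
#
#   config.CUDA_DEFAULT_PTX_CC == (5, 0) (upstream default) -> set to (7, 5)
#   config.CUDA_DEFAULT_PTX_CC == (8, 6) (user-configured)  -> left at (8, 6)
#
# i.e. the effective default is max(config_default_cc, numba_cuda_default_ptx_cc).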


def test(*args, **kwargs):
if not is_available():
raise cuda_error()
6 changes: 3 additions & 3 deletions numba_cuda/numba/cuda/codegen.py
Expand Up @@ -2,7 +2,7 @@

from numba.core import config, serialize
from numba.core.codegen import Codegen, CodeLibrary
from .cudadrv import devices, driver, nvvm, runtime
from .cudadrv import devices, driver, nvrtc, nvvm, runtime
from numba.cuda.cudadrv.libs import get_cudalib
from numba.cuda.cudadrv.linkable_code import LinkableCode
from numba.cuda.memory_management.nrt import NRT_LIBRARY
@@ -211,7 +211,7 @@ def get_asm_str(self, cc=None):
if ptxes:
return ptxes

arch = nvvm.get_arch_option(*cc)
arch = nvrtc.get_arch_option(*cc)
options = self._nvvm_options.copy()
options["arch"] = arch

@@ -240,7 +240,7 @@ def get_ltoir(self, cc=None):
if ltoir is not None:
return ltoir

arch = nvvm.get_arch_option(*cc)
arch = nvrtc.get_arch_option(*cc)
options = self._nvvm_options.copy()
options["arch"] = arch
options["gen-lto"] = None
4 changes: 2 additions & 2 deletions numba_cuda/numba/cuda/compiler.py
@@ -35,7 +35,7 @@
from numba.cuda import nvvmutils
from numba.cuda.api import get_current_device
from numba.cuda.codegen import ExternalCodeLibrary
from numba.cuda.cudadrv import nvvm
from numba.cuda.cudadrv import nvvm, nvrtc
from numba.cuda.descriptor import cuda_target
from numba.cuda.flags import CUDAFlags
from numba.cuda.target import CUDACABICallConv
@@ -640,7 +640,7 @@ def compile(
# If the user has used the config variable to specify a non-default CC that is
# greater than the lowest non-deprecated one, then we should default to their
# specified CC instead of the lowest non-deprecated one.
MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, nvvm.LOWEST_CURRENT_CC)
MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, nvrtc.get_lowest_supported_cc())
cc = cc or MIN_CC
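
# An illustrative example (actual values depend on configuration and the
# installed NVRTC): with the upstream default CC of (5, 0) and a lowest
# supported CC of (7, 5), MIN_CC is (7, 5); with the config variable set to
# 8.6 (NUMBA_CUDA_DEFAULT_PTX_CC in the environment), MIN_CC is (8, 6).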

cres = compile_cuda(
42 changes: 20 additions & 22 deletions numba_cuda/numba/cuda/cuda_paths.py
@@ -132,16 +132,9 @@ def _get_nvvm_wheel():
return None


def get_major_cuda_version():
# TODO: remove once cuda-python is
# a hard dependency
from numba.cuda.cudadrv.runtime import get_version

return get_version()[0]


def get_nvrtc_dso_path():
site_paths = [site.getusersitepackages()] + site.getsitepackages()

for sp in site_paths:
lib_dir = os.path.join(
sp,
@@ -150,23 +143,28 @@ def get_nvrtc_dso_path():
("bin" if IS_WIN32 else "lib") if sp else None,
)
if lib_dir and os.path.exists(lib_dir):
try:
major = get_major_cuda_version()
if major == 11:
cu_ver = "112" if IS_WIN32 else "11.2"
elif major == 12:
cu_ver = "120" if IS_WIN32 else "12"
else:
raise NotImplementedError(f"CUDA {major} is not supported")

return os.path.join(
chosen_path = None

# Check for each version of the NVRTC library, preferring the most
# recent.
versions = (
"112" if IS_WIN32 else "11.2",
"120" if IS_WIN32 else "12",
"130" if IS_WIN32 else "13",
)

for version in versions:
dso_path = os.path.join(
lib_dir,
f"nvrtc64_{cu_ver}_0.dll"
f"nvrtc64_{version}_0.dll"
if IS_WIN32
else f"libnvrtc.so.{cu_ver}",
else f"libnvrtc.so.{version}",
)
except RuntimeError:
continue

if os.path.exists(dso_path) and os.path.isfile(dso_path):
chosen_path = dso_path

return chosen_path
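

# An illustrative sketch of the lookup above: on Linux the candidates probed
# are libnvrtc.so.11.2, libnvrtc.so.12 and libnvrtc.so.13; on Windows they are
# nvrtc64_112_0.dll, nvrtc64_120_0.dll and nvrtc64_130_0.dll. Because the loop
# runs in ascending version order and overwrites chosen_path on each hit, the
# newest library present wins.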


def _get_nvrtc_wheel():
4 changes: 4 additions & 0 deletions numba_cuda/numba/cuda/cudadrv/error.py
@@ -38,3 +38,7 @@ class NvrtcBuiltinOperationFailure(NvrtcError):

class NvrtcSupportError(ImportError):
pass


class CCSupportError(RuntimeError):
pass
2 changes: 1 addition & 1 deletion numba_cuda/numba/cuda/cudadrv/libs.py
@@ -154,7 +154,7 @@ def test():
print(f"\t\t{location}")

# Checks for dynamic libraries
libs = "nvvm nvrtc cudart".split()
libs = "nvvm nvrtc".split()
for lib in libs:
path = get_cudalib(lib)
print("Finding {} from {}".format(lib, _get_source_variable(lib)))
151 changes: 80 additions & 71 deletions numba_cuda/numba/cuda/cudadrv/nvrtc.py
@@ -1,6 +1,7 @@
from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
from enum import IntEnum
from numba.cuda.cudadrv.error import (
CCSupportError,
NvrtcError,
NvrtcBuiltinOperationFailure,
NvrtcCompilationError,
@@ -79,20 +80,6 @@ class NVRTC:
(for Numba) open_cudalib function to load the NVRTC library.
"""

_CU11_2ONLY_PROTOTYPES = {
# nvrtcResult nvrtcGetNumSupportedArchs(int *numArchs);
"nvrtcGetNumSupportedArchs": (nvrtc_result, POINTER(c_int)),
# nvrtcResult nvrtcGetSupportedArchs(int *supportedArchs);
"nvrtcGetSupportedArchs": (nvrtc_result, POINTER(c_int)),
}

_CU12ONLY_PROTOTYPES = {
# nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *ltoSizeRet);
"nvrtcGetLTOIRSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
# nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *lto);
"nvrtcGetLTOIR": (nvrtc_result, nvrtc_program, c_char_p),
}

_PROTOTYPES = {
# nvrtcResult nvrtcVersion(int *major, int *minor)
"nvrtcVersion": (nvrtc_result, POINTER(c_int), POINTER(c_int)),
@@ -140,6 +127,14 @@ class NVRTC:
),
# nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
"nvrtcGetProgramLog": (nvrtc_result, nvrtc_program, c_char_p),
# nvrtcResult nvrtcGetNumSupportedArchs(int *numArchs);
"nvrtcGetNumSupportedArchs": (nvrtc_result, POINTER(c_int)),
# nvrtcResult nvrtcGetSupportedArchs(int *supportedArchs);
"nvrtcGetSupportedArchs": (nvrtc_result, POINTER(c_int)),
# nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *ltoSizeRet);
"nvrtcGetLTOIRSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
# nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *lto);
"nvrtcGetLTOIR": (nvrtc_result, nvrtc_program, c_char_p),
}

# Singleton reference
@@ -157,18 +152,18 @@ def __new__(cls):
cls.__INSTANCE = None
raise NvrtcSupportError("NVRTC cannot be loaded") from e

from numba.cuda.cudadrv.runtime import get_version

if get_version() >= (11, 2):
inst._PROTOTYPES |= inst._CU11_2ONLY_PROTOTYPES
if get_version() >= (12, 0):
inst._PROTOTYPES |= inst._CU12ONLY_PROTOTYPES

# Find & populate functions
for name, proto in inst._PROTOTYPES.items():
func = getattr(lib, name)
func.restype = proto[0]
func.argtypes = proto[1:]
try:
func = getattr(lib, name)
func.restype = proto[0]
func.argtypes = proto[1:]
except AttributeError:
if "LTOIR" in name:
# CUDA 11 does not have LTOIR functions; ignore
continue
else:
raise

@functools.wraps(func)
def checked_call(*args, func=func, name=name):
@@ -195,52 +190,16 @@ def checked_call(*args, func=func, name=name):

return cls.__INSTANCE

@functools.cache
def get_supported_archs(self):
"""
Get the architectures supported by NVRTC as a list of compute capability tuples.
"""
ver = self.get_version()
if ver < (11, 0):
raise RuntimeError(
"Unsupported CUDA version. CUDA 11.0 or higher is required."
)
elif ver == (11, 0):
return [
(3, 0),
(3, 2),
(3, 5),
(3, 7),
(5, 0),
(5, 2),
(5, 3),
(6, 0),
(6, 1),
(6, 2),
(7, 0),
(7, 2),
(7, 5),
]
elif ver == (11, 1):
return [
(3, 5),
(3, 7),
(5, 0),
(5, 2),
(5, 3),
(6, 0),
(6, 1),
(6, 2),
(7, 0),
(7, 2),
(7, 5),
(8, 0),
]
else:
num = c_int()
self.nvrtcGetNumSupportedArchs(byref(num))
archs = (c_int * num.value)()
self.nvrtcGetSupportedArchs(archs)
return [(archs[i] // 10, archs[i] % 10) for i in range(num.value)]
num = c_int()
self.nvrtcGetNumSupportedArchs(byref(num))
archs = (c_int * num.value)()
self.nvrtcGetSupportedArchs(archs)
return [(archs[i] // 10, archs[i] % 10) for i in range(num.value)]
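
# For example, NVRTC reports supported architectures as integers such as 75
# or 90, which the comprehension above decodes into CC tuples:
#
#   >>> [divmod(arch, 10) for arch in (75, 80, 90)]
#   [(7, 5), (8, 0), (9, 0)]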

def get_version(self):
"""
@@ -349,9 +308,9 @@ def compile(src, name, cc, ltoir=False):

version = nvrtc.get_version()
ver_str = lambda v: ".".join(str(x) for x in v)
if version < (11, 0):
if version < (11, 2):
raise RuntimeError(
"Unsupported CUDA version. CUDA 11.0 or higher is required."
"Unsupported CUDA version. CUDA 11.2 or higher is required."
)
else:
supported_arch = nvrtc.get_supported_archs()
@@ -383,8 +342,10 @@ def compile(src, name, cc, ltoir=False):
else:
arch = f"--gpu-architecture=compute_{major}{minor}"

cuda_include = [
f"{get_cuda_paths()['include_dir'].info}",
cuda_include_dir = get_cuda_paths()["include_dir"].info
cuda_includes = [
f"{cuda_include_dir}",
f"{os.path.join(cuda_include_dir, 'cccl')}",
]

nvrtc_version = nvrtc.get_version()
@@ -405,7 +366,7 @@

nrt_include = os.path.join(numba_cuda_path, "memory_management")

includes = [numba_include, *cuda_include, nrt_include, *extra_includes]
includes = [numba_include, *cuda_includes, nrt_include, *extra_includes]

if config.CUDA_USE_NVIDIA_BINDING:
options = ProgramOptions(
@@ -474,3 +435,51 @@ def write(self, msg):
else:
ptx = nvrtc.get_ptx(program)
return ptx, log


def find_closest_arch(mycc):
"""
Given a compute capability, return the closest compute capability supported
by the CUDA toolkit.

:param mycc: Compute capability as a tuple ``(MAJOR, MINOR)``
:return: Closest supported CC as a tuple ``(MAJOR, MINOR)``
"""
supported_ccs = get_supported_ccs()

for i, cc in enumerate(supported_ccs):
if cc == mycc:
# Matches
return cc
elif cc > mycc:
# Exceeded
if i == 0:
# CC lower than supported
msg = (
"GPU compute capability %d.%d is not supported "
"(requires >=%d.%d)" % (mycc + cc)
)
raise CCSupportError(msg)
else:
# return the previous CC
return supported_ccs[i - 1]

# CC higher than supported
return supported_ccs[-1] # Choose the highest


def get_arch_option(major, minor):
"""Matches with the closest architecture option"""
if config.FORCE_CUDA_CC:
arch = config.FORCE_CUDA_CC
else:
arch = find_closest_arch((major, minor))
return "compute_%d%d" % arch


def get_lowest_supported_cc():
return min(get_supported_ccs())


def get_supported_ccs():
return NVRTC().get_supported_archs()