NVIDIA · gmarkall · Jun 27, 2025 · Feb 21, 2025 · Feb 24, 2025 · Feb 24, 2025
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -19,11 +19,11 @@ jobs:
       - compute-matrix
       - build-conda
       - test-conda
-      - test-conda-pynvjitlink
+      - test-conda-ctypes-binding
       - test-simulator
       - build-wheels
       - test-wheels
-      - test-wheels-pynvjitlink
+      - test-wheels-ctypes-binding
       - test-wheels-deps-wheels
       - build-docs
     secrets: inherit
@@ -76,14 +76,14 @@ jobs:
       script: "ci/test_conda.sh"
       run_codecov: false
       matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }}
-  test-conda-pynvjitlink:
+  test-conda-ctypes-binding:
     needs:
       - build-conda
       - compute-matrix
     uses: ./.github/workflows/conda-python-tests.yaml
     with:
       build_type: pull-request
-      script: "ci/test_conda_pynvjitlink.sh"
+      script: "ci/test_conda_ctypes_binding.sh"
       run_codecov: false
       # This selects "ARCH=amd64 and CUDA >=12, with the latest supported Python for each CUDA major version".
       matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }}
@@ -114,14 +114,14 @@ jobs:
       build_type: pull-request
       script: "ci/test_wheel.sh false"
       matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }}
-  test-wheels-pynvjitlink:
+  test-wheels-ctypes-binding:
     needs:
       - build-wheels
       - compute-matrix
     uses: ./.github/workflows/wheels-test.yaml
     with:
       build_type: pull-request
-      script: "ci/test_wheel_pynvjitlink.sh"
+      script: "ci/test_wheel_ctypes_binding.sh"
       # This selects "ARCH=amd64 and CUDA >=12, with the latest supported Python for each CUDA major version".
       matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }}
       matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER | split(".") | .[0] | tonumber >= 12))) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))

diff --git a/ci/test_conda.sh b/ci/test_conda.sh
@@ -8,7 +8,7 @@ set -euo pipefail
 if [ "${CUDA_VER%.*.*}" = "11" ]; then
   CTK_PACKAGES="cudatoolkit=11"
 else
-  CTK_PACKAGES="cuda-cccl cuda-nvcc-impl cuda-nvrtc libcurand-dev"
+  CTK_PACKAGES="cuda-cccl cuda-nvcc-impl cuda-nvrtc libcurand-dev cuda-cuobjdump"
   apt-get update
   apt remove --purge `dpkg --get-selections | grep cuda-nvvm | awk '{print $1}'` -y
   apt remove --purge `dpkg --get-selections | grep cuda-nvrtc | awk '{print $1}'` -y

diff --git a/ci/test_conda_pynvjitlink.sh → ci/test_conda_ctypes_binding.sh b/ci/test_conda_pynvjitlink.sh → ci/test_conda_ctypes_binding.sh
@@ -48,12 +48,6 @@ EXITCODE=0
 trap "EXITCODE=1" ERR
 set +e
 
-
-rapids-logger "Install pynvjitlink"
-set +u
-rapids-mamba-retry install -c rapidsai pynvjitlink
-set -u
-
 rapids-logger "Build tests"
 
 PY_SCRIPT="
@@ -70,7 +64,7 @@ popd
 
 
 rapids-logger "Run Tests"
-NUMBA_CUDA_ENABLE_PYNVJITLINK=1 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v
+NUMBA_CUDA_USE_NVIDIA_BINDING=0 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v
 
 popd
 

diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh
@@ -10,7 +10,8 @@ package=$(realpath wheel/numba_cuda*.whl)
 echo "Package path: ${package}"
 python -m pip install \
     "${package}[test]" \
-    "cuda-python==${CUDA_VER_MAJOR_MINOR%.*}.*"
+    "cuda-python==${CUDA_VER_MAJOR_MINOR%.*}.*" \
+    "cuda-core==0.3.*"
 
 GET_TEST_BINARY_DIR="
 import numba_cuda

diff --git a/ci/test_wheel_pynvjitlink.sh → ci/test_wheel_ctypes_binding.sh b/ci/test_wheel_pynvjitlink.sh → ci/test_wheel_ctypes_binding.sh
@@ -11,7 +11,6 @@ echo "Package path: $package"
 python -m pip install \
     "${package}[test]" \
     cuda-python \
-    "pynvjitlink-cu${CUDA_VER_MAJOR}"
 
 rapids-logger "Build tests"
 PY_SCRIPT="
@@ -23,7 +22,7 @@ print(test_dir)
 
 NUMBA_CUDA_TEST_BIN_DIR=$(python -c "$PY_SCRIPT")
 pushd $NUMBA_CUDA_TEST_BIN_DIR
-make
+NUMBA_CUDA_USE_NVIDIA_BINDING=0 make
 popd
 
 
@@ -35,9 +34,9 @@ mkdir -p "${RAPIDS_TESTS_DIR}"
 pushd "${RAPIDS_TESTS_DIR}"
 
 rapids-logger "Show Numba system info"
-python -m numba --sysinfo
+NUMBA_CUDA_USE_NVIDIA_BINDING=0 python -m numba --sysinfo
 
 rapids-logger "Run Tests"
-NUMBA_CUDA_ENABLE_PYNVJITLINK=1 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v
+NUMBA_CUDA_USE_NVIDIA_BINDING=0 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v
 
 popd
diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml
@@ -29,6 +29,7 @@ requirements:
     - python
     - numba >=0.59.1
     - cuda-bindings
+    - cuda-core ==0.3.*
 
 about:
   home: {{ project_urls["Homepage"] }}

diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py
@@ -2,11 +2,16 @@
 from numba import runtests
 from numba.core import config
 from .utils import _readenv
+import warnings
+
 
 # Enable pynvjitlink based on the following precedence:
 # 1. Config setting "CUDA_ENABLE_PYNVJITLINK" (highest priority)
 # 2. Environment variable "NUMBA_CUDA_ENABLE_PYNVJITLINK"
 # 3. Auto-detection of pynvjitlink module (lowest priority)
+
+pynvjitlink_auto_enabled = False
+
 if getattr(config, "CUDA_ENABLE_PYNVJITLINK", None) is None:
     if (
         _pynvjitlink_enabled_in_env := _readenv(
@@ -15,9 +20,10 @@
     ) is not None:
         config.CUDA_ENABLE_PYNVJITLINK = _pynvjitlink_enabled_in_env
     else:
-        config.CUDA_ENABLE_PYNVJITLINK = (
+        pynvjitlink_auto_enabled = (
             importlib.util.find_spec("pynvjitlink") is not None
         )
+        config.CUDA_ENABLE_PYNVJITLINK = pynvjitlink_auto_enabled
 
 # Upstream numba sets CUDA_USE_NVIDIA_BINDING to 0 by default, so it always
 # exists. Override, but not if explicitly set to 0 in the environment.
@@ -44,6 +50,21 @@
             "bindings."
         )
 
+if config.CUDA_ENABLE_PYNVJITLINK:
+    if USE_NV_BINDING:
+        warnings.warn(
+            "Explicitly enabling pynvjitlink is no longer necessary. "
+            "NVIDIA bindings are enabled. cuda.core will be used "
+            "in place of pynvjitlink."
+        )
+    elif pynvjitlink_auto_enabled:
+        # Ignore the fact that pynvjitlink is enabled, because that was an
+        # automatic decision based on discovering pynvjitlink was present; the
+        # user didn't ask for it
+        pass
+    else:
+        raise RuntimeError("nvJitLink requires the NVIDIA CUDA bindings. ")
+
 if config.ENABLE_CUDASIM:
     from .simulator_init import *
 else:

diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py
@@ -22,7 +22,10 @@ def run_nvdisasm(cubin, flags):
     try:
         fd, fname = tempfile.mkstemp()
         with open(fname, "wb") as f:
-            f.write(cubin)
+            if config.CUDA_USE_NVIDIA_BINDING:
+                f.write(cubin.code)
+            else:
+                f.write(cubin)
 
         try:
             cp = subprocess.run(
@@ -271,7 +274,7 @@ def get_cubin(self, cc=None):
             return cubin
 
         if self._lto and config.DUMP_ASSEMBLY:
-            linker = driver.Linker.new(
+            linker = driver._Linker.new(
                 max_registers=self._max_registers,
                 cc=cc,
                 additional_flags=["-ptx"],
@@ -280,14 +283,14 @@ def get_cubin(self, cc=None):
             # `-ptx` flag is meant to view the optimized PTX for LTO objects.
             # Non-LTO objects are not passed to linker.
             self._link_all(linker, cc, ignore_nonlto=True)
-
-            ptx = linker.get_linked_ptx().decode("utf-8")
+            ptx = linker.get_linked_ptx()
+            ptx = ptx.decode("utf-8")
 
             print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, "-"))
             print(ptx)
             print("=" * 80)
 
-        linker = driver.Linker.new(
+        linker = driver._Linker.new(
             max_registers=self._max_registers, cc=cc, lto=self._lto
         )
         self._link_all(linker, cc, ignore_nonlto=False)
@@ -312,7 +315,6 @@ def get_cufunc(self):
         cufunc = self._cufunc_cache.get(device.id, None)
         if cufunc:
             return cufunc
-
         cubin = self.get_cubin(cc=device.compute_capability)
         module = ctx.create_module_image(
             cubin, self._setup_functions, self._teardown_functions