diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml new file mode 100644 index 000000000..b6da52c2a --- /dev/null +++ b/.github/actions/fetch_ctk/action.yml @@ -0,0 +1,193 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +name: Fetch mini CTK + +description: Fetch (or create) a mini CUDA Toolkit from cache + +inputs: + host-platform: + required: true + type: string + cuda-version: + required: true + type: string + cuda-components: + description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'" + required: false + type: string + default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile" + cuda-path: + description: "where the CTK components will be installed to, relative to $PWD" + required: false + type: string + default: "./cuda_toolkit" + +runs: + using: composite + steps: + - name: Set up CTK cache variable + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + # Pre-process the component list to ensure hash uniqueness + CTK_CACHE_COMPONENTS=${{ inputs.cuda-components }} + # Conditionally strip out libnvjitlink for CUDA versions < 12 + CUDA_MAJOR_VER="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})" + if [[ "$CUDA_MAJOR_VER" -lt 12 ]]; then + CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libnvjitlink/}" + fi + # Conditionally strip out cuda_crt and libnvvm for CUDA versions < 13 + CUDA_MAJOR_VER="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})" + if [[ "$CUDA_MAJOR_VER" -lt 13 ]]; then + CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//cuda_crt/}" + CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libnvvm/}" + fi + # Conditionally strip out libcufile since it does not support Windows + if [[ "${{ inputs.host-platform }}" == win-* ]]; then + CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libcufile/}" + fi + # Cleanup stray commas after removing components + CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//,,/,}" + + HASH=$(echo -n "${CTK_CACHE_COMPONENTS}" | sha256sum | awk '{print $1}') + echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}-$HASH" >> $GITHUB_ENV + echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}-$HASH.tar.gz" >> $GITHUB_ENV + echo "CTK_CACHE_COMPONENTS=${CTK_CACHE_COMPONENTS}" >> $GITHUB_ENV + + - name: Install dependencies + uses: ./.github/actions/install_unix_deps + continue-on-error: false + with: + dependencies: "zstd curl xz-utils" + dependent_exes: "zstd curl xz" + + - name: Download CTK cache + id: ctk-get-cache + uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + continue-on-error: true + with: + key: ${{ env.CTK_CACHE_KEY }} + path: ./${{ env.CTK_CACHE_FILENAME }} + fail-on-cache-miss: false + + - name: Get CUDA components + if: ${{ steps.ctk-get-cache.outputs.cache-hit != 'true' }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + # Everything under this folder is packed and stored in the GitHub Cache space, + # and unpacked after retrieving from the cache. + CACHE_TMP_DIR="./cache_tmp_dir" + rm -rf $CACHE_TMP_DIR + mkdir $CACHE_TMP_DIR + + # The binary archives (redist) are guaranteed to be updated as part of the release posting. 
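# Illustration only (comments, not executed by this step): each component in the
# redistrib JSON maps to per-platform entries whose 'relative_path' points at the
# archive to download. Resolving cuda_nvcc for linux-x86_64 could look roughly like
# this sketch, reusing the same inline-python lookup that populate_cuda_path uses below:
#
#   curl -s "https://developer.download.nvidia.com/compute/cuda/redist/redistrib_12.9.1.json" \
#     | python -c "import sys, json; print(json.load(sys.stdin)['cuda_nvcc']['linux-x86_64']['relative_path'])"
#
# populate_cuda_path performs the real per-component lookup for the requested platform.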
+ CTK_BASE_URL="https://developer.download.nvidia.com/compute/cuda/redist/" + CTK_JSON_URL="$CTK_BASE_URL/redistrib_${{ inputs.cuda-version }}.json" + if [[ "${{ inputs.host-platform }}" == linux* ]]; then + if [[ "${{ inputs.host-platform }}" == "linux-64" ]]; then + CTK_SUBDIR="linux-x86_64" + elif [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then + CTK_SUBDIR="linux-sbsa" + fi + function extract() { + tar -xvf $1 -C $CACHE_TMP_DIR --strip-components=1 + } + elif [[ "${{ inputs.host-platform }}" == "win-64" ]]; then + CTK_SUBDIR="windows-x86_64" + function extract() { + _TEMP_DIR_=$(mktemp -d) + unzip $1 -d $_TEMP_DIR_ + cp -r $_TEMP_DIR_/*/* $CACHE_TMP_DIR + rm -rf $_TEMP_DIR_ + # see commit NVIDIA/cuda-python@69410f1d9228e775845ef6c8b4a9c7f37ffc68a5 + chmod 644 $CACHE_TMP_DIR/LICENSE + } + fi + function populate_cuda_path() { + # take the component name as a argument + function download() { + curl -kLSs $1 -o $2 + } + CTK_COMPONENT=$1 + CTK_COMPONENT_REL_PATH="$(curl -s $CTK_JSON_URL | + python -c "import sys, json; print(json.load(sys.stdin)['${CTK_COMPONENT}']['${CTK_SUBDIR}']['relative_path'])")" + CTK_COMPONENT_URL="${CTK_BASE_URL}/${CTK_COMPONENT_REL_PATH}" + CTK_COMPONENT_COMPONENT_FILENAME="$(basename $CTK_COMPONENT_REL_PATH)" + download $CTK_COMPONENT_URL $CTK_COMPONENT_COMPONENT_FILENAME + extract $CTK_COMPONENT_COMPONENT_FILENAME + rm $CTK_COMPONENT_COMPONENT_FILENAME + } + + # Get headers and shared libraries in place + for item in $(echo $CTK_CACHE_COMPONENTS | tr ',' ' '); do + populate_cuda_path "$item" + done + # TODO: check Windows + if [[ "${{ inputs.host-platform }}" == linux* ]]; then + mv $CACHE_TMP_DIR/lib $CACHE_TMP_DIR/lib64 + fi + ls -l $CACHE_TMP_DIR + + # Prepare the cache + # Note: try to escape | and > ... + tar -czvf ${CTK_CACHE_FILENAME} ${CACHE_TMP_DIR} + + # "Move" files from temp dir to CUDA_PATH + CUDA_PATH="./cuda_toolkit" + mkdir -p $CUDA_PATH + # Unfortunately we cannot use "rsync -av $CACHE_TMP_DIR/ $CUDA_PATH" because + # not all runners have rsync pre-installed (or even installable, such as + # Git Bash). We do it in the dumb way. + cp -r $CACHE_TMP_DIR/* $CUDA_PATH + rm -rf $CACHE_TMP_DIR + ls -l $CUDA_PATH + + - name: Upload CTK cache + if: ${{ !cancelled() && + steps.ctk-get-cache.outputs.cache-hit != 'true' }} + uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + with: + key: ${{ env.CTK_CACHE_KEY }} + path: ./${{ env.CTK_CACHE_FILENAME }} + + - name: Restore CTK cache + if: ${{ steps.ctk-get-cache.outputs.cache-hit == 'true' }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + ls -l + CACHE_TMP_DIR="./cache_tmp_dir" + CUDA_PATH="./cuda_toolkit" + mkdir -p $CUDA_PATH + tar -xzvf $CTK_CACHE_FILENAME + # Can't use rsync here, see above + cp -r $CACHE_TMP_DIR/* $CUDA_PATH + rm -rf $CACHE_TMP_DIR $CTK_CACHE_FILENAME + ls -l $CUDA_PATH + if [ ! 
-d "$CUDA_PATH/include" ]; then + exit 1 + fi + + - name: Move CTK to the specified location + if: ${{ inputs.cuda-path != './cuda_toolkit' }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + mv ./cuda_toolkit ${{ inputs.cuda-path }} + + - name: Set output environment variables + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + # mimics actual CTK installation + if [[ "${{ inputs.host-platform }}" == linux* ]]; then + CUDA_PATH=$(realpath "${{ inputs.cuda-path }}") + echo "${CUDA_PATH}/bin" >> $GITHUB_PATH + echo "LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${LD_LIBRARY_PATH:-}" >> $GITHUB_ENV + elif [[ "${{ inputs.host-platform }}" == win* ]]; then + function normpath() { + echo "$(echo $(cygpath -w $1) | sed 's/\\/\\\\/g')" + } + CUDA_PATH=$(normpath $(realpath "${{ inputs.cuda-path }}")) + echo "$(normpath ${CUDA_PATH}/bin)" >> $GITHUB_PATH + fi + echo "CUDA_PATH=${CUDA_PATH}" >> $GITHUB_ENV + echo "CUDA_HOME=${CUDA_PATH}" >> $GITHUB_ENV diff --git a/.github/actions/get_pr_number/action.yml b/.github/actions/get_pr_number/action.yml new file mode 100644 index 000000000..1641f8068 --- /dev/null +++ b/.github/actions/get_pr_number/action.yml @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +name: Get the PR number + +description: Get the PR number without relying on the pull_request* event triggers. + +runs: + using: composite + steps: + - name: Get PR info (non-main, non-release branch) + if: ${{ github.ref_name != 'main' && !startsWith(github.ref_name, 'release/') }} + uses: nv-gha-runners/get-pr-info@main + id: get-pr-info + + - name: Extract PR number (non-main, non-release branch) + if: ${{ github.ref_name != 'main' && !startsWith(github.ref_name, 'release/') }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + trap 'echo "Error at line $LINENO"; exit 1' ERR + PR_NUMBER="${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" + if [[ -z "$PR_NUMBER" ]]; then + echo "Cannot extract PR number for ref: ${{ github.ref_name }}" + exit 1 + fi + echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV + echo "BUILD_PREVIEW=1" >> $GITHUB_ENV + + - name: Get PR data (main or release/* branch) + if: ${{ github.ref_name == 'main' || startsWith(github.ref_name, 'release/') }} + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + id: get-pr-data + with: + script: | + const prs = await github.rest.repos.listPullRequestsAssociatedWithCommit({ + commit_sha: context.sha, + owner: context.repo.owner, + repo: context.repo.repo, + }); + if (!prs.data.length) { + core.setFailed("No PR associated with this commit on 'main' or 'release/*'."); + } else { + return prs.data[0]; + } + + - name: Extract PR number (main or release/* branch) + if: ${{ github.ref_name == 'main' || startsWith(github.ref_name, 'release/') }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + trap 'echo "Error at line $LINENO"; exit 1' ERR + PR_NUMBER="${{ fromJSON(steps.get-pr-data.outputs.result).number }}" + if [[ -z "$PR_NUMBER" ]]; then + echo "No associated PR found for the commit in 'main' or 'release/*'." 
+ exit 1 + fi + echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV + echo "BUILD_LATEST=1" >> $GITHUB_ENV diff --git a/.github/actions/install_unix_deps/action.yml b/.github/actions/install_unix_deps/action.yml new file mode 100644 index 000000000..6289541c9 --- /dev/null +++ b/.github/actions/install_unix_deps/action.yml @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +name: Install dependencies on Ubuntu + +description: Install needed dependencies, regardless if using GitHub- or self- hosted runners, container, sudo or not. + +inputs: + dependencies: + required: true + type: string + dependent_exes: + required: true + type: string + +runs: + using: composite + steps: + - name: Install dependencies + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + dependencies=(${{ inputs.dependencies }}) + dependent_exes=(${{ inputs.dependent_exes }}) + + not_found=0 + for dep in ${dependent_exes[@]}; do + if ! (command -v $dep 2>&1 >/dev/null); then + not_found=1 + break + fi + done + if [[ $not_found == 0 ]]; then + echo "All dependencies are found. Do nothing." + exit 0 + fi + if ! (command -v sudo 2>&1 >/dev/null); then + if [[ $EUID == 0 ]]; then + alias SUDO="" + else + echo "The following oprations require root access." + exit 1 + fi + else + alias SUDO="sudo" + fi + shopt -s expand_aliases + SUDO apt update + SUDO apt install -y ${dependencies[@]} diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml new file mode 100644 index 000000000..f5d6aae12 --- /dev/null +++ b/.github/workflows/build-wheel.yml @@ -0,0 +1,255 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +on: + workflow_call: + inputs: + host-platform: + required: true + type: string + cuda-version: + required: true + type: string + prev-cuda-version: + required: true + type: string + +defaults: + run: + shell: bash --noprofile --norc -xeuo pipefail {0} + +permissions: + contents: read # This is required for actions/checkout + +jobs: + build: + strategy: + fail-fast: false + matrix: + python-version: + - "3.10" + - "3.11" + - "3.12" + - "3.13" + # - "3.14" + # - "3.14t" + name: py${{ matrix.python-version }} + runs-on: ${{ (inputs.host-platform == 'linux-64' && 'linux-amd64-cpu8') || + (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || + (inputs.host-platform == 'win-64' && 'windows-2022') }} + steps: + - name: Checkout ${{ github.event.repository.name }} + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + + # The env vars ACTIONS_CACHE_SERVICE_V2, ACTIONS_RESULTS_URL, and ACTIONS_RUNTIME_TOKEN + # are exposed by this action. + - name: Enable sccache + if: ${{ startsWith(inputs.host-platform, 'linux') }} + uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # 0.0.9 + + # xref: https://github.com/orgs/community/discussions/42856#discussioncomment-7678867 + - name: Adding addtional GHA cache-related env vars + uses: actions/github-script@v7 + with: + script: | + core.exportVariable('ACTIONS_CACHE_URL', process.env['ACTIONS_CACHE_URL']) + core.exportVariable('ACTIONS_RUNTIME_URL', process.env['ACTIONS_RUNTIME_URL']) + + - name: Setup proxy cache + uses: nv-gha-runners/setup-proxy-cache@main + continue-on-error: true + # Skip the cache on Windows nodes outside of our org. 
+ if: ${{ inputs.host-platform != 'win-64' }} + + - name: Set up Python + id: setup-python1 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + # WAR: setup-python is not relocatable, and cibuildwheel hard-wires to 3.12... + # see https://github.com/actions/setup-python/issues/871 + python-version: "3.12" + + - name: Set up MSVC + if: ${{ startsWith(inputs.host-platform, 'win') }} + uses: ilammy/msvc-dev-cmd@v1 # TODO: ask admin to allow pinning commits + + - name: Set environment variables + env: + HOST_PLATFORM: ${{ inputs.host-platform }} + PY_VER: ${{ matrix.python-version }} + SHA: ${{ github.sha }} + run: ./ci/tools/env-vars build + + - name: Dump environment + run: | + env + + - name: Build numba-cuda wheel + uses: pypa/cibuildwheel@9c00cb4f6b517705a3794b22395aedc36257242c # v3.2.1 + with: + package-dir: . + output-dir: ${{ env.NUMBA_CUDA_ARTIFACTS_DIR }} + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + # CIBW mounts the host filesystem under /host + CIBW_ENVIRONMENT_LINUX: > + CC="/host/${{ env.SCCACHE_PATH }} cc" + CXX="/host/${{ env.SCCACHE_PATH }} c++" + SCCACHE_GHA_ENABLED=true + ACTIONS_RUNTIME_TOKEN=${{ env.ACTIONS_RUNTIME_TOKEN }} + ACTIONS_RUNTIME_URL=${{ env.ACTIONS_RUNTIME_URL }} + ACTIONS_RESULTS_URL=${{ env.ACTIONS_RESULTS_URL }} + ACTIONS_CACHE_URL=${{ env.ACTIONS_CACHE_URL }} + ACTIONS_CACHE_SERVICE_V2=${{ env.ACTIONS_CACHE_SERVICE_V2 }} + SCCACHE_DIR=/host/${{ env.SCCACHE_DIR }} + SCCACHE_CACHE_SIZE=${{ env.SCCACHE_CACHE_SIZE }} + # check cache stats before leaving cibuildwheel + CIBW_BEFORE_TEST_LINUX: > + "/host/${{ env.SCCACHE_PATH }}" --show-stats + # force the test stage to be run (so that before-test is not skipped) + # TODO: we might want to think twice on adding this, it does a lot of + # things before reaching this command. + CIBW_TEST_COMMAND_LINUX: > + echo "ok!" + + - name: List the numba-cuda artifacts directory + run: | + if [[ "${{ inputs.host-platform }}" == win* ]]; then + export CHOWN=chown + else + export CHOWN="sudo chown" + fi + $CHOWN -R $(whoami) ${{ env.NUMBA_CUDA_ARTIFACTS_DIR }} + ls -lahR ${{ env.NUMBA_CUDA_ARTIFACTS_DIR }} + + - name: Upload numba-cuda build artifacts + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: ${{ env.NUMBA_CUDA_ARTIFACT_NAME }} + path: ${{ env.NUMBA_CUDA_ARTIFACTS_DIR }}/*.whl + if-no-files-found: error + + build-tests: + needs: + - build + strategy: + fail-fast: false + matrix: + # We just need 1 Python version because the artifacts are Python agnostic. + python-version: + - "3.10" + cuda-version: + - ${{ inputs.cuda-version }} + - ${{ inputs.prev-cuda-version }} + name: py${{ matrix.python-version }} CUDA ${{ matrix.cuda-version }} + runs-on: ${{ (inputs.host-platform == 'linux-64' && 'linux-amd64-cpu8') || + (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || + (inputs.host-platform == 'win-64' && 'windows-2022') }} + steps: + - name: Checkout ${{ github.event.repository.name }} + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + + # The env vars ACTIONS_CACHE_SERVICE_V2, ACTIONS_RESULTS_URL, and ACTIONS_RUNTIME_TOKEN + # are exposed by this action. 
+ - name: Enable sccache + if: ${{ startsWith(inputs.host-platform, 'linux') }} + uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # 0.0.9 + + # xref: https://github.com/orgs/community/discussions/42856#discussioncomment-7678867 + - name: Adding addtional GHA cache-related env vars + uses: actions/github-script@v7 + with: + script: | + core.exportVariable('ACTIONS_CACHE_URL', process.env['ACTIONS_CACHE_URL']) + core.exportVariable('ACTIONS_RUNTIME_URL', process.env['ACTIONS_RUNTIME_URL']) + + - name: Setup proxy cache + uses: nv-gha-runners/setup-proxy-cache@main + continue-on-error: true + # Skip the cache on Windows nodes outside of our org. + if: ${{ inputs.host-platform != 'win-64' }} + + - name: Set up Python + id: setup-python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Set up MSVC + if: ${{ startsWith(inputs.host-platform, 'win') }} + uses: ilammy/msvc-dev-cmd@v1 # TODO: ask admin to allow pinning commits + + - name: Set environment variables + env: + HOST_PLATFORM: ${{ inputs.host-platform }} + PY_VER: ${{ matrix.python-version }} + SHA: ${{ github.sha }} + run: | + ./ci/tools/env-vars build + CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${{ matrix.cuda-version }})" + echo "CUDA_MAJOR=${CUDA_MAJOR}" >> ${GITHUB_ENV} + + - name: Dump environment + run: | + env + + - name: Download numba-cuda build artifacts + if: ${{ env.SKIP_NUMBA_CUDA_TEST == '0'}} + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + with: + name: ${{ env.NUMBA_CUDA_ARTIFACT_NAME }} + path: ${{ env.NUMBA_CUDA_ARTIFACTS_DIR }} + + - name: Display structure of downloaded numba-cuda artifacts + run: | + pwd + ls -lahR ${NUMBA_CUDA_ARTIFACTS_DIR} + + - name: Install numba-cuda + run: | + # used in testing/Makefile + pip install ${{ env.NUMBA_CUDA_ARTIFACTS_DIR }}/*.whl "cuda-bindings==${CUDA_MAJOR}.*" + + - name: Set up mini CTK ${{ matrix.cuda-version }} + uses: ./.github/actions/fetch_ctk + continue-on-error: false + with: + host-platform: ${{ inputs.host-platform }} + cuda-version: ${{ matrix.cuda-version }} + cuda-components: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_cccl,libnvjitlink,cuda_cuobjdump" + + - name: Build numba-cuda test artifacts aginst CUDA ${{ matrix.cuda-version }} + run: | + pushd testing + if [[ "${{ inputs.host-platform }}" == linux* ]]; then + PATH=$(dirname ${SCCACHE_PATH}):${PATH} + SCCACHE_GHA_ENABLED=true + fi + + nvcc --version + + # TODO: move this list to json + if [[ "${CUDA_MAJOR}" == 12 ]]; then + CC_LIST=(70 75 80 86 89 90 120) + elif [[ "${CUDA_MAJOR}" == 13 ]]; then + CC_LIST=(75 80 86 89 90 120) + fi + + for cc in ${CC_LIST[*]}; do + make -j $(nproc) GPU_CC=${cc} + mkdir cu${CUDA_MAJOR}_cc${cc} + mv *.cubin *.fatbin *.ptx *.o *.a *.ltoir cu${CUDA_MAJOR}_cc${cc} + done + popd + + - name: Upload numba-cuda test artifacts + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: ${{ env.NUMBA_CUDA_TEST_ARTIFACT_NAME }}-cu${{ env.CUDA_MAJOR }} + path: testing/cu* + if-no-files-found: error diff --git a/.github/workflows/ci-new.yaml b/.github/workflows/ci-new.yaml new file mode 100644 index 000000000..c41164521 --- /dev/null +++ b/.github/workflows/ci-new.yaml @@ -0,0 +1,206 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +# Note: This name is referred to in the test job, so make sure any changes are sync'd up! +# Further this is referencing a run in the backport branch to fetch old bindings. +name: "CI" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} + cancel-in-progress: true + +on: + push: + branches: + - "pull-request/[0-9]+" + - "main" + +jobs: + ci-vars: + runs-on: ubuntu-latest + outputs: + CUDA_BUILD_VER: ${{ steps.get-vars.outputs.cuda_build_ver }} + CUDA_PREV_BUILD_VER: ${{ steps.get-vars.outputs.cuda_prev_build_ver }} + steps: + - name: Checkout repository + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + + - name: Get CUDA build versions + id: get-vars + run: | + cuda_build_ver=$(jq -r .cuda.build.version ci/versions.json) + echo "cuda_build_ver=$cuda_build_ver" >> $GITHUB_OUTPUT + cuda_prev_build_ver=$(jq -r .cuda.prev_build.version ci/versions.json) + echo "cuda_prev_build_ver=$cuda_prev_build_ver" >> $GITHUB_OUTPUT + + should-skip: + runs-on: ubuntu-latest + outputs: + skip: ${{ steps.get-should-skip.outputs.skip }} + steps: + - name: Checkout repository + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - name: Compute whether to skip builds and tests + id: get-should-skip + env: + GH_TOKEN: ${{ github.token }} + run: | + set -euxo pipefail + if ${{ startsWith(github.ref_name, 'pull-request/') }}; then + skip="$(gh pr view "$(grep -Po '(\d+)$' <<< '${{ github.ref_name }}')" --json title --jq '.title | contains("[no-ci]")')" + else + skip=false + fi + echo "skip=${skip}" >> "$GITHUB_OUTPUT" + + # WARNING: make sure all of the build jobs are in sync + build-linux-64: + needs: + - ci-vars + - should-skip + strategy: + fail-fast: false + matrix: + host-platform: + - linux-64 + name: Build ${{ matrix.host-platform }} + if: ${{ github.repository_owner == 'nvidia' && !fromJSON(needs.should-skip.outputs.skip) }} + secrets: inherit + uses: ./.github/workflows/build-wheel.yml + with: + host-platform: ${{ matrix.host-platform }} + cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }} + prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }} + + # WARNING: make sure all of the build jobs are in sync + build-linux-aarch64: + needs: + - ci-vars + - should-skip + strategy: + fail-fast: false + matrix: + host-platform: + - linux-aarch64 + name: Build ${{ matrix.host-platform }} + if: ${{ github.repository_owner == 'nvidia' && !fromJSON(needs.should-skip.outputs.skip) }} + secrets: inherit + uses: ./.github/workflows/build-wheel.yml + with: + host-platform: ${{ matrix.host-platform }} + cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }} + prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }} + + # WARNING: make sure all of the build jobs are in sync + build-windows: + needs: + - ci-vars + - should-skip + strategy: + fail-fast: false + matrix: + host-platform: + - win-64 + name: Build ${{ matrix.host-platform }} + if: ${{ github.repository_owner == 'nvidia' && !fromJSON(needs.should-skip.outputs.skip) }} + secrets: inherit + uses: ./.github/workflows/build-wheel.yml + with: + host-platform: ${{ matrix.host-platform }} + cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }} + prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }} + + # WARNING: make sure both Linux test jobs are in sync + test-linux-64: + strategy: + fail-fast: false + matrix: + host-platform: + - linux-64 + name: Test ${{ 
matrix.host-platform }} + if: ${{ github.repository_owner == 'nvidia' }} + permissions: + contents: read # This is required for actions/checkout + needs: + - build-linux-64 + secrets: inherit + uses: ./.github/workflows/test-wheel-linux.yml + with: + build-type: pull-request + host-platform: ${{ matrix.host-platform }} + + # WARNING: make sure both Linux test jobs are in sync + test-linux-aarch64: + strategy: + fail-fast: false + matrix: + host-platform: + - linux-aarch64 + name: Test ${{ matrix.host-platform }} + if: ${{ github.repository_owner == 'nvidia' }} + permissions: + contents: read # This is required for actions/checkout + needs: + - build-linux-aarch64 + secrets: inherit + uses: ./.github/workflows/test-wheel-linux.yml + with: + build-type: pull-request + host-platform: ${{ matrix.host-platform }} + + test-windows: + strategy: + fail-fast: false + matrix: + host-platform: + - win-64 + name: Test ${{ matrix.host-platform }} + if: ${{ github.repository_owner == 'nvidia' }} + permissions: + contents: read # This is required for actions/checkout + needs: + - build-windows + secrets: inherit + uses: ./.github/workflows/test-wheel-windows.yml + with: + build-type: pull-request + host-platform: ${{ matrix.host-platform }} + + checks: + name: Check job status + if: always() + runs-on: ubuntu-latest + needs: + - test-linux-64 + - test-linux-aarch64 + - test-windows + steps: + - name: Exit + run: | + # if any dependencies were cancelled, that's a failure + # + # see https://docs.github.com/en/actions/reference/workflows-and-actions/expressions#always + # and https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/troubleshooting-required-status-checks#handling-skipped-but-required-checks + # for why this cannot be encoded in the job-level `if:` field + # + # TL; DR: `$REASONS` + # + # The intersection of skipped-as-success and required status checks + # creates a scenario where if you DON'T `always()` run this job, the + # status check UI will block merging and if you DO `always()` run and + # a dependency is _cancelled_ (due to a critical failure, which is + # somehow not considered a failure ¯\_(ツ)_/¯) then the critically + # failing job(s) will timeout causing a cancellation here and the + # build to succeed which we don't want (originally this was just + # 'exit 0') + if ${{ needs.test-linux-64.result == 'cancelled' || + needs.test-linux-aarch64.result == 'cancelled' || + needs.test-windows.result == 'cancelled' || + needs.doc.result == 'cancelled' }}; then + exit 1 + else + exit 0 + fi diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml new file mode 100644 index 000000000..be8aa739f --- /dev/null +++ b/.github/workflows/test-wheel-linux.yml @@ -0,0 +1,177 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +name: "CI: Test wheels" + +on: + workflow_call: + inputs: + build-type: + type: string + required: true + host-platform: + type: string + required: true + matrix_filter: + type: string + default: "." 
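# Note on `matrix_filter` (illustration only): it is a jq expression spliced into the
# matrix computation below, so a caller passing e.g. 'map(select(.PY_VER == "3.12"))'
# narrows the test matrix. The effective shell command then looks roughly like this
# sketch, where $TEST_MATRIX holds the entries read from ci/test-matrix.json:
#
#   jq -c 'map(select(.PY_VER == "3.12")) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end' <<< "$TEST_MATRIX"
#
# The default value "." keeps every matching entry.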
+ +defaults: + run: + shell: bash --noprofile --norc -xeuo pipefail {0} + +jobs: + compute-matrix: + runs-on: ubuntu-latest + env: + BUILD_TYPE: ${{ inputs.build-type }} + ARCH: ${{ (inputs.host-platform == 'linux-64' && 'amd64') || + (inputs.host-platform == 'linux-aarch64' && 'arm64') }} + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Checkout ${{ github.event.repository.name }} + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + + - name: Validate Test Type + run: | + if [[ "$BUILD_TYPE" != "pull-request" ]] && [[ "$BUILD_TYPE" != "nightly" ]] && [[ "$BUILD_TYPE" != "branch" ]]; then + echo "Invalid build type! Must be one of 'nightly', 'pull-request', or 'branch'." + exit 1 + fi + + - name: Compute Python Test Matrix + id: compute-matrix + run: | + # Use the nightly matrix for branch tests + MATRIX_TYPE="${BUILD_TYPE}" + if [[ "${MATRIX_TYPE}" == "branch" ]]; then + MATRIX_TYPE="nightly" + fi + + # Read base matrix from JSON file for the specific architecture + TEST_MATRIX=$(jq --arg arch "$ARCH" --arg matrix_type "$MATRIX_TYPE" ' + .linux[$matrix_type] | + map(select(.ARCH == $arch)) + ' ci/test-matrix.json) + + # Add special runner for amd64 if applicable + if [[ "${ARCH}" == "amd64" ]]; then + SPECIAL_RUNNERS=$(jq ' + .linux.special_runners.amd64 + ' ci/test-matrix.json) + TEST_MATRIX=$(jq --argjson special "$SPECIAL_RUNNERS" '. + $special' <<< "$TEST_MATRIX") + fi + + MATRIX="$( + jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end' <<< "$TEST_MATRIX" + )" + + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + + test: + name: py${{ matrix.PY_VER }}, ${{ matrix.CUDA_VER }}, ${{ (matrix.LOCAL_CTK == '1' && 'local') || 'wheels' }}, ${{ matrix.GPU }} + needs: compute-matrix + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} + runs-on: "linux-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-1" + # The build stage could fail but we want the CI to keep moving. 
+ if: ${{ github.repository_owner == 'nvidia' && !cancelled() }} + # Our self-hosted runners require a container + container: + options: -u root --security-opt seccomp=unconfined --shm-size 16g + image: ubuntu:22.04 + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + steps: + - name: Ensure GPU is working + run: nvidia-smi + + - name: Checkout ${{ github.event.repository.name }} + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + + - name: Setup proxy cache + uses: nv-gha-runners/setup-proxy-cache@main + continue-on-error: true + + - name: Install dependencies + uses: ./.github/actions/install_unix_deps + continue-on-error: false + with: + # for artifact fetching, graphics libs + dependencies: "jq wget libgl1 libegl1" + dependent_exes: "jq wget" + + - name: Set environment variables + env: + CUDA_VER: ${{ matrix.CUDA_VER }} + HOST_PLATFORM: ${{ inputs.host-platform }} + LOCAL_CTK: ${{ matrix.LOCAL_CTK }} + PY_VER: ${{ matrix.PY_VER }} + SHA: ${{ github.sha }} + run: ./ci/tools/env-vars test + + - name: Download numba-cuda build artifacts + if: ${{ env.SKIP_NUMBA_CUDA_TEST == '0'}} + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + with: + name: ${{ env.NUMBA_CUDA_ARTIFACT_NAME }} + path: ${{ env.NUMBA_CUDA_ARTIFACTS_DIR }} + + - name: Display structure of downloaded numba-cuda artifacts + run: | + pwd + ls -lahR $NUMBA_CUDA_ARTIFACTS_DIR + + - name: Download numba-cuda test artifacts + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + with: + name: ${{ env.NUMBA_CUDA_TEST_ARTIFACT_NAME }}-cu${{ env.TEST_CUDA_MAJOR }} + path: testing/ + + - name: Display structure of downloaded numba-cuda test artifacts + run: | + pwd + ls -lahR testing/ + + - name: Set up Python ${{ matrix.PY_VER }} + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.PY_VER }} + env: + # we use self-hosted runners on which setup-python behaves weirdly... + AGENT_TOOLSDIRECTORY: "/opt/hostedtoolcache" + + - name: Set up mini CTK + if: ${{ matrix.LOCAL_CTK == '1' }} + uses: ./.github/actions/fetch_ctk + continue-on-error: false + with: + host-platform: ${{ inputs.host-platform }} + cuda-version: ${{ matrix.CUDA_VER }} + cuda-components: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_cccl,libnvjitlink,cuda_cuobjdump" + +# - name: Set up latest cuda_sanitizer_api +# if: ${{ env.SETUP_SANITIZER == '1' }} +# uses: ./.github/actions/fetch_ctk +# continue-on-error: false +# with: +# host-platform: ${{ inputs.host-platform }} +# cuda-version: ${{ env.LATEST_CUDA_VERSION }} +# cuda-components: "cuda_sanitizer_api" +# +# - name: Set up compute-sanitizer +# run: setup-sanitizer + + - name: Run numba-cuda tests + if: ${{ env.SKIP_NUMBA_CUDA_TEST == '0' }} + env: + CUDA_VER: ${{ matrix.CUDA_VER }} + LOCAL_CTK: ${{ matrix.LOCAL_CTK }} + run: | + if [[ "${LOCAL_CTK}" != 1 ]]; then + export NUMBA_CUDA_TEST_WHEEL_ONLY=1 + fi + run-tests diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml new file mode 100644 index 000000000..3588dbca1 --- /dev/null +++ b/.github/workflows/test-wheel-windows.yml @@ -0,0 +1,155 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +name: "CI: Test wheels" + +on: + workflow_call: + inputs: + build-type: + type: string + required: true + host-platform: + type: string + required: true + matrix_filter: + type: string + default: "." + +jobs: + compute-matrix: + runs-on: ubuntu-latest + defaults: + run: + shell: bash --noprofile --norc -xeuo pipefail {0} + env: + BUILD_TYPE: ${{ inputs.build-type }} + ARCH: ${{ (inputs.host-platform == 'win-64' && 'amd64') }} + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Checkout ${{ github.event.repository.name }} + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + + - name: Validate Test Type + run: | + if [[ "$BUILD_TYPE" != "pull-request" ]] && [[ "$BUILD_TYPE" != "nightly" ]] && [[ "$BUILD_TYPE" != "branch" ]]; then + echo "Invalid build type! Must be one of 'nightly', 'pull-request', or 'branch'." + exit 1 + fi + - name: Compute Python Test Matrix + id: compute-matrix + run: | + # Use the nightly matrix for branch tests + MATRIX_TYPE="${BUILD_TYPE}" + if [[ "${MATRIX_TYPE}" == "branch" ]]; then + MATRIX_TYPE="nightly" + fi + + # Read base matrix from JSON file for the specific architecture + TEST_MATRIX=$(jq --arg arch "$ARCH" --arg matrix_type "$MATRIX_TYPE" ' + .windows[$matrix_type] | + map(select(.ARCH == $arch)) + ' ci/test-matrix.json) + + MATRIX="$( + jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end' <<< "$TEST_MATRIX" + )" + + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + + test: + name: py${{ matrix.PY_VER }}, ${{ matrix.CUDA_VER }}, ${{ (matrix.LOCAL_CTK == '1' && 'local') || 'wheels' }}, ${{ matrix.GPU }} (${{ matrix.DRIVER_MODE }}) + # The build stage could fail but we want the CI to keep moving. + needs: compute-matrix + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} + if: ${{ github.repository_owner == 'nvidia' && !cancelled() }} + runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-1" + steps: + - name: Checkout ${{ github.event.repository.name }} + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + + - name: Setup proxy cache + uses: nv-gha-runners/setup-proxy-cache@main + continue-on-error: true + + - name: Update driver + env: + DRIVER_MODE: ${{ matrix.DRIVER_MODE }} + GPU_TYPE: ${{ matrix.GPU }} + run: | + ci/tools/install_gpu_driver.ps1 + + - name: Ensure GPU is working + run: | + nvidia-smi + + $mode_output = nvidia-smi | Select-String -Pattern "${{ matrix.DRIVER_MODE }}" + Write-Output "Driver mode check: $mode_output" + if ("$mode_output" -eq "") { + Write-Error "Switching to driver mode ${{ matrix.DRIVER_MODE }} failed!" 
+ exit 1 + } + Write-Output "Driver mode verified: ${{ matrix.DRIVER_MODE }}" + + - name: Set environment variables + env: + CUDA_VER: ${{ matrix.CUDA_VER }} + HOST_PLATFORM: ${{ inputs.host-platform }} + LOCAL_CTK: ${{ matrix.LOCAL_CTK }} + PY_VER: ${{ matrix.PY_VER }} + SHA: ${{ github.sha }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: ./ci/tools/env-vars test + + - name: Download numba-cuda build artifacts + if: ${{ env.SKIP_NUMBA_CUDA_TEST == '0'}} + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + with: + name: ${{ env.NUMBA_CUDA_ARTIFACT_NAME }} + path: ${{ env.NUMBA_CUDA_ARTIFACTS_DIR }} + + - name: Display structure of downloaded numba-cuda artifacts + run: | + Get-Location + Get-ChildItem -Recurse -Force $env:NUMBA_CUDA_ARTIFACTS_DIR | Select-Object Mode, LastWriteTime, Length, FullName + + - name: Download numba-cuda test artifacts + uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + with: + name: ${{ env.NUMBA_CUDA_TEST_ARTIFACT_NAME }}-cu${{ env.TEST_CUDA_MAJOR }} + path: testing/ + + - name: Display structure of downloaded numba-cuda test artifacts + run: | + Get-Location + Get-ChildItem -Recurse -Force testing/ | Select-Object Mode, LastWriteTime, Length, FullName + + - name: Set up Python ${{ matrix.PY_VER }} + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.PY_VER }} + + - name: Set up mini CTK + if: ${{ matrix.LOCAL_CTK == '1' }} + uses: ./.github/actions/fetch_ctk + continue-on-error: false + with: + host-platform: ${{ inputs.host-platform }} + cuda-version: ${{ matrix.CUDA_VER }} + cuda-components: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_cccl,libnvjitlink,cuda_cuobjdump" + + - name: Run numba-cuda tests + if: ${{ env.SKIP_NUMBA_CUDA_TEST == '0' }} + env: + CUDA_VER: ${{ matrix.CUDA_VER }} + LOCAL_CTK: ${{ matrix.LOCAL_CTK }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + if [[ "${LOCAL_CTK}" != 1 ]]; then + export NUMBA_CUDA_TEST_WHEEL_ONLY=1 + fi + run-tests diff --git a/ci/test-matrix.json b/ci/test-matrix.json new file mode 100644 index 000000000..6f3ad6944 --- /dev/null +++ b/ci/test-matrix.json @@ -0,0 +1,44 @@ +{ + "_description": "Test matrix configurations for CUDA Python CI workflows. This file consolidates the test matrices that were previously hardcoded in the workflow files. All GPU and ARCH values are hard-coded for each architecture: l4 GPU for amd64, a100 GPU for arm64.", + "_sorted_by": "Please keep matrices sorted in ascending order by [ARCH, PY_VER, CUDA_VER, LOCAL_CTK, GPU, DRIVER]. 
Windows entries also include DRIVER_MODE.", + "_notes": "DRIVER: 'earliest' does not work with CUDA 12.9.1 and LOCAL_CTK: 0 does not work with CUDA 12.0.1", + "linux": { + "pull-request": [ + { "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "v100", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.11", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "rtxpro6000", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.11", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "v100", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "rtxpro6000", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.11", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.11", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" } + ], + "nightly": [], + "special_runners": { + "amd64": [ + { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "H100", "DRIVER": "latest" } + ] + } + }, + "windows": { + "pull-request": [ + { "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "rtx2080", "DRIVER": "latest", "DRIVER_MODE": "WDDM" }, + { "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "rtxpro6000", "DRIVER": "latest", "DRIVER_MODE": "TCC" }, + { "ARCH": "amd64", "PY_VER": "3.11", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "v100", "DRIVER": "latest", "DRIVER_MODE": "MCDM" }, + { "ARCH": "amd64", "PY_VER": "3.11", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "rtx4090", "DRIVER": "latest", "DRIVER_MODE": "WDDM" }, + { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest", "DRIVER_MODE": "MCDM" }, + { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest", "DRIVER_MODE": "TCC" }, + { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest", "DRIVER_MODE": "TCC" }, + { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "rtxpro6000", "DRIVER": "latest", "DRIVER_MODE": "MCDM" } + ], + "nightly": [] + } +} diff --git a/ci/tools/download-wheels b/ci/tools/download-wheels new file mode 100755 index 000000000..8081966c0 --- /dev/null +++ 
b/ci/tools/download-wheels @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# A utility script to download component wheels from GitHub Actions artifacts. +# This script reuses the same logic that was in release.yml to maintain consistency. + +set -euo pipefail + +# Check required arguments +if [[ $# -lt 3 ]]; then + echo "Usage: $0 [output-dir]" >&2 + echo " run-id: The GitHub Actions run ID containing the artifacts" >&2 + echo " component: The component name pattern to download (e.g., cuda-core, cuda-bindings)" >&2 + echo " repository: The GitHub repository (e.g., NVIDIA/cuda-python)" >&2 + echo " output-dir: Optional output directory (default: ./dist)" >&2 + exit 1 +fi + +RUN_ID="$1" +COMPONENT="$2" +REPOSITORY="$3" +OUTPUT_DIR="${4:-./dist}" + +# Ensure we have a GitHub token +if [[ -z "${GH_TOKEN:-}" ]]; then + echo "Error: GH_TOKEN environment variable is required" + exit 1 +fi + +echo "Downloading wheels for component: $COMPONENT from run: $RUN_ID" + +# Download component wheels using the same logic as release.yml +if [[ "$COMPONENT" == "all" ]]; then + # Download all component patterns + gh run download "$RUN_ID" -p "numba*" -R "$REPOSITORY" +else + gh run download "$RUN_ID" -p "${COMPONENT}*" -R "$REPOSITORY" +fi + +# Create output directory +mkdir -p "$OUTPUT_DIR" + +# Process downloaded artifacts +for p in numba* +do + if [[ ! -d "$p" ]]; then + continue + fi + + # exclude cython test artifacts + if [[ "${p}" == *-tests ]]; then + echo "Skipping test artifact: $p" + continue + fi + + # If we're not downloading "all", only process matching component + if [[ "$COMPONENT" != "all" && "$p" != ${COMPONENT}* ]]; then + continue + fi + + echo "Processing artifact: $p" + # Move wheel files to output directory + if [[ -d "$p" ]]; then + find "$p" -name "*.whl" -exec mv {} "$OUTPUT_DIR/" \; + fi +done + +# Clean up artifact directories +rm -rf numba* + +echo "Downloaded wheels to: $OUTPUT_DIR" +ls -la "$OUTPUT_DIR" diff --git a/ci/tools/env-vars b/ci/tools/env-vars new file mode 100755 index 000000000..83f1145a1 --- /dev/null +++ b/ci/tools/env-vars @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# A utility script to set up the GitHub environment variables for the CI. + +set -euo pipefail + +# Check if the script was called with exactly 1 argument +if [[ ${#} -ne 1 ]]; then + echo "Error: This script requires exactly 1 argument (the build mode). 
You provided ${#}" + echo "Usage: ${0} build_mode[build or test]" + exit 1 +fi + +PYTHON_VERSION_FORMATTED=$(echo "${PY_VER}" | tr -d '.') + +if [[ "${HOST_PLATFORM}" == linux* ]]; then + REPO_DIR=$(pwd) + TOOLS_PATH="${REPO_DIR}/ci/tools" +elif [[ "${HOST_PLATFORM}" == win* ]]; then + PWD=$(pwd) + REPO_DIR=$(cygpath -w ${PWD}) + TOOLS_PATH=$(cygpath -w ${PWD}/ci/tools) +fi + +echo "${TOOLS_PATH}" >> $GITHUB_PATH +{ + echo "PYTHON_VERSION_FORMATTED=${PYTHON_VERSION_FORMATTED}" +} >> $GITHUB_ENV + +if [[ "${1}" == "build" ]]; then + # platform is handled by the default value of platform (`auto`) in cibuildwheel + # here we only need to specify the python version we want + echo "CIBW_BUILD=cp${PYTHON_VERSION_FORMATTED}-*" >> $GITHUB_ENV + NUMBA_CUDA_ARTIFACT_BASENAME="numba-cuda-python${PYTHON_VERSION_FORMATTED}-${HOST_PLATFORM}" + # Enforce an explicit cache dir so that we can reuse this path later + echo "SCCACHE_DIR=${HOME}/.cache/sccache" >> $GITHUB_ENV + echo "SCCACHE_CACHE_SIZE=1G" >> $GITHUB_ENV +elif [[ "${1}" == "test" ]]; then + TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${CUDA_VER})" + TEST_CUDA_MINOR="$(cut -d '.' -f 2 <<< ${CUDA_VER})" + NUMBA_CUDA_ARTIFACT_BASENAME="numba-cuda-python${PYTHON_VERSION_FORMATTED}-${HOST_PLATFORM}" +# # We don't test compute-sanitizer on CTK<12 because backporting fixes is too much effort +# # We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix +# # Only local ctk installs have compute-sanitizer; there is no wheel for it +# if [[ "${PY_VER}" == "3.12" && "${CUDA_VER}" != "11.8.0" && "${LOCAL_CTK}" == 1 && "${HOST_PLATFORM}" == linux* ]]; then +# echo "LATEST_CUDA_VERSION=$(bash .github/workflows/guess_latest.sh $TEST_CUDA_MAJOR)" >> $GITHUB_ENV +# SETUP_SANITIZER=1 +# else +# SETUP_SANITIZER=0 +# echo "SANITIZER_CMD=" >> $GITHUB_ENV +# fi + { +# echo "SETUP_SANITIZER=${SETUP_SANITIZER}" +# echo "SKIP_NUMBA_CUDA_TEST=${SKIP_NUMBA_CUDA_TEST}" + echo "SANITIZER_CMD=" + echo "TEST_CUDA_MAJOR=${TEST_CUDA_MAJOR}" + echo "TEST_CUDA_MINOR=${TEST_CUDA_MINOR}" + } >> $GITHUB_ENV +fi + +{ + echo "NUMBA_CUDA_ARTIFACT_BASENAME=${NUMBA_CUDA_ARTIFACT_BASENAME}" + echo "NUMBA_CUDA_ARTIFACT_NAME=${NUMBA_CUDA_ARTIFACT_BASENAME}-${SHA}" + echo "NUMBA_CUDA_TEST_ARTIFACT_NAME=numba-cuda-${HOST_PLATFORM}-${SHA}-test" + echo "NUMBA_CUDA_ARTIFACTS_DIR=$(realpath "${REPO_DIR}/dist")" +} >> $GITHUB_ENV diff --git a/ci/tools/install_gpu_driver.ps1 b/ci/tools/install_gpu_driver.ps1 new file mode 100644 index 000000000..5602eeb48 --- /dev/null +++ b/ci/tools/install_gpu_driver.ps1 @@ -0,0 +1,82 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +# Install the driver +function Install-Driver { + + # Set the correct URL, filename, and arguments to the installer + # This driver is picked to support Windows 11 & CUDA 13.0 + $version = '581.15' + + # Get GPU type from environment variable + $gpu_type = $env:GPU_TYPE + + $data_center_gpus = @('a100', 'h100', 'l4', 't4', 'v100', 'rtxa6000', 'rtx6000ada') + $desktop_gpus = @('rtx2080', 'rtx4090', 'rtxpro6000') + + if ($data_center_gpus -contains $gpu_type) { + Write-Output "Data center GPU detected: $gpu_type" + $filename="$version-data-center-tesla-desktop-winserver-2022-2025-dch-international.exe" + $server_path="tesla/$version" + } elseif ($desktop_gpus -contains $gpu_type) { + Write-Output "Desktop GPU detected: $gpu_type" + $filename="$version-desktop-win10-win11-64bit-international-dch-whql.exe" + $server_path="Windows/$version" + } else { + Write-Output "Unknown GPU type: $gpu_type" + exit 1 + } + + $url="https://us.download.nvidia.com/$server_path/$filename" + $filepath="C:\NVIDIA-Driver\$filename" + + Write-Output "Installing NVIDIA driver version $version for GPU type $gpu_type" + Write-Output "Download URL: $url" + + # Silent install arguments + $install_args = '/s /noeula /noreboot'; + + # Create the folder for the driver download + if (!(Test-Path -Path 'C:\NVIDIA-Driver')) { + New-Item -Path 'C:\' -Name 'NVIDIA-Driver' -ItemType 'directory' | Out-Null + } + + # Download the file to a specified directory + # Disabling progress bar due to https://github.com/GoogleCloudPlatform/compute-gpu-installation/issues/29 + $ProgressPreference_tmp = $ProgressPreference + $ProgressPreference = 'SilentlyContinue' + Write-Output 'Downloading the driver installer...' + Invoke-WebRequest $url -OutFile $filepath + $ProgressPreference = $ProgressPreference_tmp + Write-Output 'Download complete!' + + # Install the file with the specified path from earlier + Write-Output 'Running the driver installer...' + Start-Process -FilePath $filepath -ArgumentList $install_args -Wait + Write-Output 'Done!' + + # Handle driver mode configuration + # This assumes we have the prior knowledge on which GPU can use which mode. + $driver_mode = $env:DRIVER_MODE + if ($driver_mode -eq "WDDM") { + Write-Output "Setting driver mode to WDDM..." + nvidia-smi -fdm 0 + } elseif ($driver_mode -eq "TCC") { + Write-Output "Setting driver mode to TCC..." + nvidia-smi -fdm 1 + } elseif ($driver_mode -eq "MCDM") { + Write-Output "Setting driver mode to MCDM..." + nvidia-smi -fdm 2 + } else { + Write-Output "Unknown driver mode: $driver_mode" + exit 1 + } + pnputil /disable-device /class Display + pnputil /enable-device /class Display + # Give it a minute to settle: + Start-Sleep -Seconds 5 +} + +# Run the functions +Install-Driver diff --git a/ci/tools/lookup-run-id b/ci/tools/lookup-run-id new file mode 100755 index 000000000..db2f84b79 --- /dev/null +++ b/ci/tools/lookup-run-id @@ -0,0 +1,99 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# A utility script to find the GitHub Actions workflow run ID for a given git tag. +# This script looks for the CI workflow run that corresponds to the commit of the given tag. 
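#
# Example (a sketch of intended usage; the tag and repository below are placeholders,
# and GH_TOKEN is assumed to be exported):
#
#   RUN_ID="$(ci/tools/lookup-run-id v0.1.0 NVIDIA/numba-cuda CI)"
#   ci/tools/download-wheels "$RUN_ID" all NVIDIA/numba-cuda ./dist
#
# Only the run ID is written to stdout; all progress messages go to stderr, so the
# output can be captured directly as shown above.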
+ +set -euo pipefail + +# Check required arguments +if [[ $# -lt 2 ]]; then + echo "Usage: $0 [workflow-name]" >&2 + echo " git-tag: The git tag to find the corresponding workflow run for" >&2 + echo " repository: The GitHub repository (e.g., NVIDIA/cuda-python)" >&2 + echo " workflow-name: Optional workflow name to filter by (default: CI)" >&2 + echo "" >&2 + echo "Examples:" >&2 + echo " $0 v13.0.1 NVIDIA/cuda-python" >&2 + echo " $0 v13.0.1 NVIDIA/cuda-python \"CI\"" >&2 + exit 1 +fi + +GIT_TAG="${1}" +REPOSITORY="${2}" +WORKFLOW_NAME="${3:-CI}" + +# Ensure we have required tools +if [[ -z "${GH_TOKEN:-}" ]]; then + echo "Error: GH_TOKEN environment variable is required" >&2 + exit 1 +fi + +if ! command -v jq >/dev/null 2>&1; then + echo "Error: jq is required but not installed" >&2 + exit 1 +fi + +if ! command -v gh >/dev/null 2>&1; then + echo "Error: GitHub CLI (gh) is required but not installed" >&2 + exit 1 +fi + +echo "Looking up run ID for tag: ${GIT_TAG} in repository: ${REPOSITORY}" >&2 + +# Resolve git tag to commit SHA +if ! COMMIT_SHA=$(git rev-parse "${GIT_TAG}"); then + echo "Error: Could not resolve git tag '${GIT_TAG}' to a commit SHA" >&2 + echo "Make sure the tag exists and you have fetched it" >&2 + exit 1 +fi + +echo "Resolved tag '${GIT_TAG}' to commit: ${COMMIT_SHA}" >&2 + +# Find workflow runs for this commit +echo "Searching for '${WORKFLOW_NAME}' workflow runs for commit: ${COMMIT_SHA}" >&2 + +# Get workflow runs for the commit, filter by workflow name and successful status +RUN_DATA=$(gh run list \ + --repo "${REPOSITORY}" \ + --commit "${COMMIT_SHA}" \ + --workflow "${WORKFLOW_NAME}" \ + --status completed \ + --json databaseId,workflowName,status,conclusion,headSha \ + --limit 10) + +if [[ -z "${RUN_DATA}" || "${RUN_DATA}" == "[]" ]]; then + echo "Error: No completed '${WORKFLOW_NAME}' workflow runs found for commit ${COMMIT_SHA}" >&2 + echo "Available workflow runs for this commit:" >&2 + gh run list --repo "${REPOSITORY}" --commit "${COMMIT_SHA}" --limit 10 || true + exit 1 +fi + +# Filter for successful runs (conclusion = success) and extract the run ID from the first one +RUN_ID=$(echo "${RUN_DATA}" | jq -r '.[] | select(.conclusion == "success") | .databaseId' | head -1) + +if [[ -z "${RUN_ID}" || "${RUN_ID}" == "null" ]]; then + echo "Error: No successful '${WORKFLOW_NAME}' workflow runs found for commit ${COMMIT_SHA}" >&2 + echo "Available workflow runs for this commit:" >&2 + gh run list --repo "$REPOSITORY" --commit "${COMMIT_SHA}" --limit 10 || true + echo "" >&2 + echo "Completed runs with their conclusions:" >&2 + echo "${RUN_DATA}" | jq -r '.[] | "\(.databaseId): \(.conclusion)"' >&2 + exit 1 +fi + +echo "Found workflow run ID: ${RUN_ID} for tag '${GIT_TAG}'" >&2 + +# Verify the run has the expected artifacts by checking if there are any artifacts +echo "Verifying artifacts exist for run ${RUN_ID}..." >&2 +ARTIFACT_LIST=$(gh run view "${RUN_ID}" --repo "${REPOSITORY}" --json url || echo "") + +if [[ -z "${ARTIFACT_LIST}" ]]; then + echo "Warning: Could not verify artifacts for workflow run ${RUN_ID}" >&2 +fi + +# Output the run ID (this is what gets used by calling scripts) +echo "${RUN_ID}" diff --git a/ci/tools/run-tests b/ci/tools/run-tests new file mode 100755 index 000000000..59eb1055e --- /dev/null +++ b/ci/tools/run-tests @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +# A utility script to install the correct packages and run the tests. + +set -euo pipefail + +echo "Installing numba-cuda wheel" +if [[ "${LOCAL_CTK}" == 1 ]]; then + pip install "${NUMBA_CUDA_ARTIFACTS_DIR}"/*.whl "cuda-bindings==${TEST_CUDA_MAJOR}.*" --group test +else + pip install $(ls "${NUMBA_CUDA_ARTIFACTS_DIR}"/*.whl)["cu${TEST_CUDA_MAJOR}"] "cuda-toolkit==${TEST_CUDA_MAJOR}.${TEST_CUDA_MINOR}.*" --group test +fi +echo "Running numba-cuda tests" +export NUMBA_CUDA_TEST_BIN_DIR=`pwd`/testing +pushd $NUMBA_CUDA_TEST_BIN_DIR +GPU_CC=$(nvidia-smi --query-gpu=compute_cap --format=csv | grep -v compute_cap | head -n 1 | sed 's/\.//') +mv cu${TEST_CUDA_MAJOR}_cc${GPU_CC}/* . +${SANITIZER_CMD} pytest -rxXs -v diff --git a/ci/versions.json b/ci/versions.json new file mode 100644 index 000000000..32b869833 --- /dev/null +++ b/ci/versions.json @@ -0,0 +1,10 @@ +{ + "cuda": { + "build": { + "version": "13.0.2" + }, + "prev_build": { + "version": "12.9.1" + } + } +} diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py index d0ff4ba55..9f887a2ba 100644 --- a/numba_cuda/numba/cuda/__init__.py +++ b/numba_cuda/numba/cuda/__init__.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-2-Clause +# delvewheel: patch + import importlib from numba.cuda.core import config from .utils import _readenv @@ -23,7 +25,8 @@ ): raise ImportError( "NVIDIA CUDA Python bindings not found. Install the 'cuda' package " - "(e.g. pip install nvidia-cuda-python or numba-cuda[cuXY])." + '(e.g. pip install "cuda-bindings==XY.*" or "numba-cuda[cuXY]", ' + "with XY=12 or XY=13)." ) if config.ENABLE_CUDASIM: diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 4838ce0e0..ff51db4f1 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -43,6 +43,12 @@ TEST_BIN_DIR, "test_device_functions.ltoir" ) + require_cuobjdump = ( + test_device_functions_fatbin_multi, + test_device_functions_fatbin, + test_device_functions_o, + ) + @unittest.skipIf( not TEST_BIN_DIR or not _have_nvjitlink(), @@ -127,14 +133,22 @@ def tearDown(self): super().tearDown() def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self): - files = [ + files = ( test_device_functions_cu, test_device_functions_ltoir, test_device_functions_fatbin_multi, - ] + ) for file in files: with self.subTest(file=file): + if ( + file in require_cuobjdump + and os.getenv("NUMBA_CUDA_TEST_WHEEL_ONLY") is not None + ): + self.skipTest( + "wheel-only environments do not have cuobjdump" + ) + f = io.StringIO() with contextlib.redirect_stdout(f): sig = "uint32(uint32, uint32)" @@ -151,16 +165,24 @@ def kernel(result): self.assertTrue("ASSEMBLY (AFTER LTO)" in f.getvalue()) def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self): - files = [ + files = ( test_device_functions_a, test_device_functions_cubin, test_device_functions_fatbin, test_device_functions_o, test_device_functions_ptx, - ] + ) for file in files: with self.subTest(file=file): + if ( + file in require_cuobjdump + and os.getenv("NUMBA_CUDA_TEST_WHEEL_ONLY") is not None + ): + self.skipTest( + "wheel-only environments do not have cuobjdump" + ) + sig = "uint32(uint32, uint32)" add_from_numba = cuda.declare_device("add_from_numba", sig) diff --git 
a/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py b/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py index e7d30dc2c..2de769670 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py @@ -592,6 +592,12 @@ def atomic_cas_2dim(res, old, ary, fill_val): old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid]) +@unittest.skipIf( + not config.ENABLE_CUDASIM + and cuda.get_current_device().compute_capability >= (12, 0) + and cuda.cudadrv.runtime.get_version()[0] == 12, + reason="NVVM 12.9 Bugged on CC 10+", +) class TestCudaAtomics(CUDATestCase): def setUp(self): super().setUp() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_complex.py b/numba_cuda/numba/cuda/tests/cudapy/test_complex.py index 2437c9ace..d793f5604 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_complex.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_complex.py @@ -9,6 +9,7 @@ from numba.cuda.testing import unittest, CUDATestCase from numba.cuda import types from numba import cuda +from numba.cuda import config from numba.cuda.tests.cudapy.complex_usecases import ( real_usecase, imag_usecase, @@ -333,6 +334,12 @@ def test_tanh(self): self.check_unary_func(tanh_usecase, ulps=2, ignore_sign_on_zero=True) +@unittest.skipIf( + not config.ENABLE_CUDASIM + and cuda.get_current_device().compute_capability >= (12, 0) + and cuda.cudadrv.runtime.get_version()[0] == 12, + reason="NVVM 12.9 Bugged on CC 10+", +) class TestAtomicOnComplexComponents(CUDATestCase): # Based on the reproducer from Issue #8309. array.real and array.imag could # not be used because they required returning an array from a generated diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py b/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py index f69ab496d..d86b9c35b 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py @@ -287,16 +287,17 @@ def use_vote_sync_all_with_mask(mask, predicate, result): valid_cases = [ # mask: unsigned/signed integer # predicate: unsigned/signed integer, boolean - ("void(uint32[:], uint32[:], int32[:])", np.uint32, np.uint32, 1), - ("void(int64[:], int64[:], int32[:])", np.int64, np.int64, 1), - ("void(uint64[:], uint64[:], int32[:])", np.uint64, np.uint64, 1), - ("void(int32[:], int32[:], int32[:])", np.int32, np.int32, 1), - ("void(uint32[:], boolean[:], int32[:])", np.uint32, np.bool_, 1), - ("void(uint64[:], boolean[:], int32[:])", np.uint64, np.bool_, 1), + ("void(uint32[:], uint32[:], int32[:])", np.uint32, np.uint32), + ("void(int64[:], int64[:], int32[:])", np.int64, np.int64), + ("void(uint64[:], uint64[:], int32[:])", np.uint64, np.uint64), + ("void(int32[:], int32[:], int32[:])", np.int32, np.int32), + ("void(uint32[:], boolean[:], int32[:])", np.uint32, np.bool_), + ("void(uint64[:], boolean[:], int32[:])", np.uint64, np.bool_), ] - for sig, mask_dtype, pred_dtype, mask_val in valid_cases: + for sig, mask_dtype, pred_dtype in valid_cases: with self.subTest(sig=sig): + mask_val = (~np.array(0, dtype=mask_dtype)).item() compiled = cuda.jit(sig)(use_vote_sync_all_with_mask) ary_mask = np.full(nelem, mask_val, dtype=mask_dtype) ary_pred = np.ones(nelem, dtype=pred_dtype) diff --git a/pixi.lock b/pixi.lock index 29ea8ddc6..6f5167fd0 100644 --- a/pixi.lock +++ b/pixi.lock @@ -15430,7 +15430,7 @@ packages: - numpy >=1.21,<3 license: BSD-2-Clause input: - hash: 374ed0f53cec9900fe88055c53fd85b4bb401a28b0f2e81241682223da95fed2 + hash: 
f4f870026faa6c5b05f9ae3b9ddcb8500d569eeafd6477b27702c588535c3418 globs: - pyproject.toml - conda: . @@ -15453,7 +15453,7 @@ packages: - numpy >=1.21,<3 license: BSD-2-Clause input: - hash: 374ed0f53cec9900fe88055c53fd85b4bb401a28b0f2e81241682223da95fed2 + hash: f4f870026faa6c5b05f9ae3b9ddcb8500d569eeafd6477b27702c588535c3418 globs: - pyproject.toml - conda: . @@ -15476,7 +15476,7 @@ packages: - numpy >=1.21,<3 license: BSD-2-Clause input: - hash: 374ed0f53cec9900fe88055c53fd85b4bb401a28b0f2e81241682223da95fed2 + hash: f4f870026faa6c5b05f9ae3b9ddcb8500d569eeafd6477b27702c588535c3418 globs: - pyproject.toml - conda: . @@ -15499,7 +15499,7 @@ packages: - numpy >=1.23,<3 license: BSD-2-Clause input: - hash: 374ed0f53cec9900fe88055c53fd85b4bb401a28b0f2e81241682223da95fed2 + hash: f4f870026faa6c5b05f9ae3b9ddcb8500d569eeafd6477b27702c588535c3418 globs: - pyproject.toml - conda: . @@ -15522,7 +15522,7 @@ packages: - numpy >=1.23,<3 license: BSD-2-Clause input: - hash: 374ed0f53cec9900fe88055c53fd85b4bb401a28b0f2e81241682223da95fed2 + hash: f4f870026faa6c5b05f9ae3b9ddcb8500d569eeafd6477b27702c588535c3418 globs: - pyproject.toml - conda: . @@ -15545,7 +15545,7 @@ packages: - numpy >=1.23,<3 license: BSD-2-Clause input: - hash: 374ed0f53cec9900fe88055c53fd85b4bb401a28b0f2e81241682223da95fed2 + hash: f4f870026faa6c5b05f9ae3b9ddcb8500d569eeafd6477b27702c588535c3418 globs: - pyproject.toml - conda: . @@ -15568,7 +15568,7 @@ packages: - numpy >=1.23,<3 license: BSD-2-Clause input: - hash: 374ed0f53cec9900fe88055c53fd85b4bb401a28b0f2e81241682223da95fed2 + hash: f4f870026faa6c5b05f9ae3b9ddcb8500d569eeafd6477b27702c588535c3418 globs: - pyproject.toml - conda: . @@ -15591,7 +15591,7 @@ packages: - numpy >=1.23,<3 license: BSD-2-Clause input: - hash: 374ed0f53cec9900fe88055c53fd85b4bb401a28b0f2e81241682223da95fed2 + hash: f4f870026faa6c5b05f9ae3b9ddcb8500d569eeafd6477b27702c588535c3418 globs: - pyproject.toml - conda: . @@ -15614,7 +15614,7 @@ packages: - numpy >=1.23,<3 license: BSD-2-Clause input: - hash: 374ed0f53cec9900fe88055c53fd85b4bb401a28b0f2e81241682223da95fed2 + hash: f4f870026faa6c5b05f9ae3b9ddcb8500d569eeafd6477b27702c588535c3418 globs: - pyproject.toml - conda: . @@ -15637,7 +15637,7 @@ packages: - numpy >=1.23,<3 license: BSD-2-Clause input: - hash: 374ed0f53cec9900fe88055c53fd85b4bb401a28b0f2e81241682223da95fed2 + hash: f4f870026faa6c5b05f9ae3b9ddcb8500d569eeafd6477b27702c588535c3418 globs: - pyproject.toml - conda: . @@ -15660,7 +15660,7 @@ packages: - numpy >=1.23,<3 license: BSD-2-Clause input: - hash: 374ed0f53cec9900fe88055c53fd85b4bb401a28b0f2e81241682223da95fed2 + hash: f4f870026faa6c5b05f9ae3b9ddcb8500d569eeafd6477b27702c588535c3418 globs: - pyproject.toml - conda: . @@ -15683,7 +15683,7 @@ packages: - numpy >=1.23,<3 license: BSD-2-Clause input: - hash: 374ed0f53cec9900fe88055c53fd85b4bb401a28b0f2e81241682223da95fed2 + hash: f4f870026faa6c5b05f9ae3b9ddcb8500d569eeafd6477b27702c588535c3418 globs: - pyproject.toml - pypi: https://files.pythonhosted.org/packages/5e/a6/9ca0eecc489640615642a6cbc0ca9e10df70df38c4d43f5a928ff18d8827/numpy-2.3.5-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl diff --git a/pyproject.toml b/pyproject.toml index 42e83bf8d..f49a8c2dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,23 +27,13 @@ dependencies = ["numba>=0.60.0", "cuda-bindings>=12.9.1,<14.0.0", "cuda-core>=0. 
cu12 = [ "cuda-bindings>=12.9.1,<13.0.0", "cuda-core>=0.3.0,<1.0.0", - "cuda-python==12.9.*", # supports all CTK 12.x - "nvidia-cuda-nvcc-cu12", # for libNVVM - "nvidia-cuda-runtime-cu12", - "nvidia-cuda-nvrtc-cu12", - "nvidia-nvjitlink-cu12", - "nvidia-cuda-cccl-cu12", + # install nvcc for libNVVM + "cuda-toolkit[cudart,nvcc,nvrtc,nvjitlink,cccl]==12.*", ] -# TODO: Use cuda-toolkit package dependencies - e.g. cuda-toolkit[curand,nvvm,nvrtc]=13.* cu13 = [ "cuda-bindings==13.*", "cuda-core>=0.3.2,<1.0.0", - "cuda-python==13.*", - "nvidia-nvvm==13.*", - "nvidia-cuda-runtime==13.*", - "nvidia-cuda-nvrtc==13.*", - "nvidia-nvjitlink==13.*", - "nvidia-cuda-cccl==13.*", + "cuda-toolkit[cudart,nvvm,nvrtc,nvjitlink,cccl]==13.*", ] [dependency-groups] @@ -156,3 +146,24 @@ exclude = [ [tool.pyrefly] search-path = ["./numba_cuda"] + +[tool.cibuildwheel] +skip = "*-musllinux_*" +enable = "cpython-freethreading" +build-verbosity = 1 + +[tool.cibuildwheel.linux] +archs = "native" +before-build = "pip install twine" +repair-wheel-command = [ + "auditwheel repair -w {dest_dir} {wheel}", + "twine check --strict {dest_dir}/*", +] + +[tool.cibuildwheel.windows] +archs = "AMD64" +before-build = "pip install delvewheel twine" +repair-wheel-command = [ + "delvewheel repair --custom-patch -w {dest_dir} {wheel}", + "twine check --strict {dest_dir}/*", +] diff --git a/testing/Makefile b/testing/Makefile index 2d9c0d138..be015e962 100644 --- a/testing/Makefile +++ b/testing/Makefile @@ -31,9 +31,16 @@ MULTI_FATBIN_GENCODE := $(MULTI_GENCODE) -gencode arch=compute_$(ALT_CC),code=[s # LTO-IR tests need to generate for the LTO "architecture" instead LTOIR_GENCODE := -gencode arch=lto_$(GPU_CC),code=lto_$(GPU_CC) +# In CI we use sccache. Note that sccache does not support generating fatbin or ltoir. 
+ifeq ($(shell command -v sccache 2>&1 >/dev/null; echo $$?),0) + SCCACHE := sccache +else + SCCACHE := +endif + # Compile with optimization; use relocatable device code to preserve device # functions in the final output -NVCC_FLAGS := -O3 -rdc true +NVCC_FLAGS := -O3 -rdc true -std=c++17 # Flags specific to output type CUBIN_FLAGS := $(GENCODE) --cubin @@ -46,13 +53,13 @@ LTOIR_FLAGS := $(LTOIR_GENCODE) -dc OUTPUT_DIR := ./ -NRT_INCLUDE_DIR := $(shell python -c "from numba.cuda.memory_management.nrt import get_include; print(get_include())") +NRT_INCLUDE_DIR := "$(shell python -c "from numba.cuda.memory_management.nrt import get_include; print(get_include())")" $(OUTPUT_DIR)/undefined_extern.cubin: undefined_extern.cu - nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $@ $< + $(SCCACHE) nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $@ $< $(OUTPUT_DIR)/test_device_functions.cubin: test_device_functions.cu - nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $@ $< + $(SCCACHE) nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $@ $< $(OUTPUT_DIR)/test_device_functions.fatbin: test_device_functions.cu nvcc $(NVCC_FLAGS) $(FATBIN_FLAGS) -o $@ $< @@ -61,10 +68,10 @@ $(OUTPUT_DIR)/test_device_functions_multi.fatbin: test_device_functions.cu nvcc $(NVCC_FLAGS) $(MULTI_FATBIN_FLAGS) -o $@ $< $(OUTPUT_DIR)/test_device_functions.ptx: test_device_functions.cu - nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $@ $< + $(SCCACHE) nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $@ $< $(OUTPUT_DIR)/test_device_functions.o: test_device_functions.cu - nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $@ $< + $(SCCACHE) nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $@ $< $(OUTPUT_DIR)/test_device_functions.a: test_device_functions.cu nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $@ $< @@ -86,7 +93,7 @@ test_device_functions: $(OUTPUT_DIR)/test_device_functions.cubin \ $(OUTPUT_DIR)/test_device_functions.ltoir $(OUTPUT_DIR)/nrt_extern.cubin: nrt_extern.cu - nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $@ $< -I$(NRT_INCLUDE_DIR) + $(SCCACHE) nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $@ $< -I$(NRT_INCLUDE_DIR) $(OUTPUT_DIR)/nrt_extern.fatbin: nrt_extern.cu nvcc $(NVCC_FLAGS) $(FATBIN_FLAGS) -o $@ $< -I$(NRT_INCLUDE_DIR) @@ -95,10 +102,10 @@ $(OUTPUT_DIR)/nrt_extern_multi.fatbin: nrt_extern.cu nvcc $(NVCC_FLAGS) $(MULTI_FATBIN_FLAGS) -o $@ $< -I$(NRT_INCLUDE_DIR) $(OUTPUT_DIR)/nrt_extern.ptx: nrt_extern.cu - nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $@ $< -I$(NRT_INCLUDE_DIR) + $(SCCACHE) nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $@ $< -I$(NRT_INCLUDE_DIR) $(OUTPUT_DIR)/nrt_extern.o: nrt_extern.cu - nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $@ $< -I$(NRT_INCLUDE_DIR) + $(SCCACHE) nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $@ $< -I$(NRT_INCLUDE_DIR) $(OUTPUT_DIR)/nrt_extern.a: nrt_extern.cu nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $@ $< -I$(NRT_INCLUDE_DIR) diff --git a/testing/generate_raw_ltoir.py b/testing/generate_raw_ltoir.py index 104aeaeaf..a40ea7b27 100644 --- a/testing/generate_raw_ltoir.py +++ b/testing/generate_raw_ltoir.py @@ -105,6 +105,7 @@ def get_ltoir(source, name, arch): "-dlto", "-rdc", "true", + "-std=c++17", *cuda_include_flags, ] options = [o.encode() for o in options] diff --git a/testing/pytest.ini b/testing/pytest.ini index d1050355e..847e3f159 100644 --- a/testing/pytest.ini +++ b/testing/pytest.ini @@ -24,4 +24,5 @@ filterwarnings = ignore:\nCompilation is falling back to object mode WITHOUT looplifting enabled.*:numba.core.errors.NumbaWarning ignore:overflow encountered in scalar .+:RuntimeWarning ignore:.*Host array used in CUDA kernel will incur copy overhead.*:numba.cuda.core.errors.NumbaPerformanceWarning + 
ignore:NVRTC log messages.*Architectures prior to.*are deprecated.*:UserWarning ignore:Benchmark machine_info is different:pytest_benchmark.logger.PytestBenchmarkWarning
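Note on the test_warp_ops.py change above: the hard-coded mask value of 1 is replaced by an all-ones bit pattern derived from each mask dtype, so every warp lane is set regardless of the integer width or signedness. A minimal illustration of what that expression evaluates to (not part of the patch):

```python
import numpy as np

# Same expression as in the updated test: bitwise-NOT of zero in the mask
# dtype, converted to a Python scalar.
for mask_dtype in (np.uint32, np.int32, np.uint64, np.int64):
    nbits = np.dtype(mask_dtype).itemsize * 8
    mask_val = (~np.array(0, dtype=mask_dtype)).item()
    # Masking with (1 << nbits) - 1 exposes the underlying bit pattern,
    # which is all ones for both signed and unsigned dtypes.
    print(np.dtype(mask_dtype).name, mask_val, hex(mask_val & ((1 << nbits) - 1)))
```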
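The new ci/versions.json pins the CUDA version used for builds (13.0.2) alongside the previous major series' build (12.9.1). This diff does not show how those values are consumed; a hypothetical reader, with illustrative names only, could look like:

```python
import json

# Hypothetical helper (not part of the patch): read the build and
# previous-build CUDA versions recorded in ci/versions.json.
with open("ci/versions.json") as f:
    cuda = json.load(f)["cuda"]

build_version = cuda["build"]["version"]            # e.g. "13.0.2"
prev_build_version = cuda["prev_build"]["version"]  # e.g. "12.9.1"
build_major, build_minor, _ = build_version.split(".")
print(build_version, prev_build_version, build_major, build_minor)
```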
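For the skipIf guards added to test_atomics.py and test_complex.py: the tests are skipped only when running on hardware (not the simulator) whose compute capability is 12.0 or higher while the installed CUDA runtime is a 12.x release. A standalone restatement of that predicate (illustrative only; the decorators in the patch query cuda.get_current_device() and cuda.cudadrv.runtime directly):

```python
# Illustrative re-statement of the new skip condition.
def skipped_by_nvvm_guard(compute_capability, cuda_runtime_major, cudasim=False):
    return (
        not cudasim
        and compute_capability >= (12, 0)
        and cuda_runtime_major == 12
    )

for cc in [(9, 0), (10, 0), (12, 0), (12, 1)]:
    for major in (12, 13):
        print(cc, major, skipped_by_nvvm_guard(cc, major))
```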