diff --git a/pkgs/development/compilers/cudatoolkit/common.nix b/pkgs/development/compilers/cudatoolkit/common.nix
index 2e15012452a80..617c2fd995808 100644
--- a/pkgs/development/compilers/cudatoolkit/common.nix
+++ b/pkgs/development/compilers/cudatoolkit/common.nix
@@ -24,6 +24,7 @@ args@
 , libkrb5
 , krb5
 , makeWrapper
+, markForCudatoolkitRootHook
 , ncurses5
 , numactl
 , nss
@@ -31,6 +32,7 @@ args@
 , python3 # FIXME: CUDAToolkit 10 may still need python27
 , pulseaudio
 , requireFile
+, setupCudaHook
 , stdenv
 , backendStdenv # E.g. gcc11Stdenv, set in extension.nix
 , unixODBC
@@ -80,11 +82,15 @@ backendStdenv.mkDerivation rec {
     addOpenGLRunpath
     autoPatchelfHook
     autoAddOpenGLRunpathHook
+    markForCudatoolkitRootHook
   ] ++ lib.optionals (lib.versionOlder version "11") [
     libsForQt5.wrapQtAppsHook
   ] ++ lib.optionals (lib.versionAtLeast version "11.8") [
     qt6Packages.wrapQtAppsHook
   ];
+  depsTargetTargetPropagated = [
+    setupCudaHook
+  ];
   buildInputs = lib.optionals (lib.versionOlder version "11") [
     libsForQt5.qt5.qtwebengine
     freeglut
@@ -280,24 +286,12 @@ backendStdenv.mkDerivation rec {
     sed -i "1 i#define _BITS_FLOATN_H" "$out/include/host_defines.h"
   '' +
   # Point NVCC at a compatible compiler
-  # FIXME: redist cuda_nvcc copy-pastes this code
-  # Refer to comments in the overrides for cuda_nvcc for explanation
   # CUDA_TOOLKIT_ROOT_DIR is legacy,
   # Cf. https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
-  # NOTE: We unconditionally set -Xfatbin=-compress-all, which reduces the size of the compiled
-  #   binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as
-  #   the default set of CUDA capabilities we build can regularly cause this to occur (for
-  #   example, with Magma).
   ''
     mkdir -p $out/nix-support
     cat <<EOF >> $out/nix-support/setup-hook
     cmakeFlags+=' -DCUDA_TOOLKIT_ROOT_DIR=$out'
-    cmakeFlags+=' -DCUDA_HOST_COMPILER=${backendStdenv.cc}/bin'
-    cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${backendStdenv.cc}/bin'
-    if [ -z "\''${CUDAHOSTCXX-}" ]; then
-      export CUDAHOSTCXX=${backendStdenv.cc}/bin;
-    fi
-    export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${backendStdenv.cc}/bin -Xfatbin=-compress-all'
     EOF
 
     # Move some libraries to the lib output so that programs that
diff --git a/pkgs/development/compilers/cudatoolkit/extension.nix b/pkgs/development/compilers/cudatoolkit/extension.nix
index f14a55aa6cf71..93800a0dbc6b1 100644
--- a/pkgs/development/compilers/cudatoolkit/extension.nix
+++ b/pkgs/development/compilers/cudatoolkit/extension.nix
@@ -38,10 +38,39 @@ final: prev: let
 
   cudaFlags = final.callPackage ./flags.nix {};
 
+  # Internal hook, used by cudatoolkit and cuda redist packages
+  # to accommodate automatic CUDAToolkit_ROOT construction
+  markForCudatoolkitRootHook = (final.callPackage
+    ({ makeSetupHook }:
+      makeSetupHook
+        { name = "mark-for-cudatoolkit-root-hook"; }
+        ./hooks/mark-for-cudatoolkit-root-hook.sh)
+    { });
+
+  # Normally propagated by cuda_nvcc or cudatoolkit through their depsHostHostPropagated
+  setupCudaHook = (final.callPackage
+    ({ makeSetupHook, backendStdenv }:
+      makeSetupHook
+        {
+          name = "setup-cuda-hook";
+
+          substitutions.ccRoot = "${backendStdenv.cc}";
+
+          # Required in addition to ccRoot as otherwise bin/gcc is looked up
+          # when building CMakeCUDACompilerId.cu
+          substitutions.ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++";
+        }
+        ./hooks/setup-cuda-hook.sh)
+    { });
+
 in
 {
   inherit
     backendStdenv
     cudatoolkit
-    cudaFlags;
+    cudaFlags
+    markForCudatoolkitRootHook
+    setupCudaHook;
+
+    saxpy = final.callPackage ./saxpy { };
 }
diff --git a/pkgs/development/compilers/cudatoolkit/hooks/mark-for-cudatoolkit-root-hook.sh b/pkgs/development/compilers/cudatoolkit/hooks/mark-for-cudatoolkit-root-hook.sh
new file mode 100644
index 0000000000000..5c18760a3a2b0
--- /dev/null
+++ b/pkgs/development/compilers/cudatoolkit/hooks/mark-for-cudatoolkit-root-hook.sh
@@ -0,0 +1,8 @@
+# shellcheck shell=bash
+
+markForCUDAToolkit_ROOT() {
+    mkdir -p "${prefix}/nix-support"
+    touch "${prefix}/nix-support/include-in-cudatoolkit-root"
+}
+
+fixupOutputHooks+=(markForCUDAToolkit_ROOT)
diff --git a/pkgs/development/compilers/cudatoolkit/hooks/nvcc-setup-hook.sh b/pkgs/development/compilers/cudatoolkit/hooks/nvcc-setup-hook.sh
new file mode 100644
index 0000000000000..89801eb7c21a5
--- /dev/null
+++ b/pkgs/development/compilers/cudatoolkit/hooks/nvcc-setup-hook.sh
@@ -0,0 +1,5 @@
+# shellcheck shell=bash
+
+# CMake's enable_language(CUDA) runs a compiler test and it doesn't account for
+# CUDAToolkit_ROOT. We have to help it locate libcudart
+export NVCC_APPEND_FLAGS+=" -L@cudartRoot@/lib -I@cudartRoot@/include"
diff --git a/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh b/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
new file mode 100644
index 0000000000000..89256e86f7898
--- /dev/null
+++ b/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
@@ -0,0 +1,68 @@
+# shellcheck shell=bash
+
+echo Sourcing setup-cuda-hook >&2
+
+extendCUDAToolkit_ROOT() {
+    if [[ -f "$1/nix-support/include-in-cudatoolkit-root" ]] ; then
+        addToSearchPathWithCustomDelimiter ";" CUDAToolkit_ROOT "$1"
+
+        if [[ -d "$1/include" ]] ; then
+            addToSearchPathWithCustomDelimiter ";" CUDAToolkit_INCLUDE_DIR "$1/include"
+        fi
+    fi
+}
+
+addEnvHooks "$targetOffset" extendCUDAToolkit_ROOT
+
+setupCUDAToolkitCompilers() {
+    echo Executing setupCUDAToolkitCompilers >&2
+
+    if [[ -n "${dontSetupCUDAToolkitCompilers-}" ]] ; then
+        return
+    fi
+
+    # Point NVCC at a compatible compiler
+
+    # For CMake-based projects:
+    # https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
+    # https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html
+    # https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html
+
+    export cmakeFlags+=" -DCUDA_HOST_COMPILER=@ccFullPath@"
+    export cmakeFlags+=" -DCMAKE_CUDA_HOST_COMPILER=@ccFullPath@"
+
+    # For non-CMake projects:
+    # We prepend --compiler-bindir to nvcc flags.
+    # Downstream packages can override these, because NVCC
+    # uses the last --compiler-bindir it gets on the command line.
+    # FIXME: this results in "incompatible redefinition" warnings.
+    # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin
+    if [ -z "${CUDAHOSTCXX-}" ]; then
+      export CUDAHOSTCXX="@ccFullPath@";
+    fi
+
+    export NVCC_PREPEND_FLAGS+=" --compiler-bindir=@ccRoot@/bin"
+
+    # NOTE: We set -Xfatbin=-compress-all, which reduces the size of the compiled
+    #   binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as
+    #   the default set of CUDA capabilities we build can regularly cause this to occur (for
+    #   example, with Magma).
+    #
+    # @SomeoneSerge: original comment was made by @ConnorBaker in .../cudatoolkit/common.nix
+    if [[ -z "${dontCompressFatbin-}" ]]; then
+        export NVCC_PREPEND_FLAGS+=" -Xfatbin=-compress-all"
+    fi
+
+    # CMake's enable_language(CUDA) runs a compiler test and it doesn't account for
+    # CUDAToolkit_ROOT. We have to help it locate libcudart
+    if [[ -z "${nvccDontPrependCudartFlags-}" ]] ; then
+        export NVCC_APPEND_FLAGS+=" -L@cudartRoot@/lib -I@cudartRoot@/include"
+    fi
+}
+
+setupCMakeCUDAToolkit_ROOT() {
+    export cmakeFlags+=" -DCUDAToolkit_INCLUDE_DIR=$CUDAToolkit_INCLUDE_DIR -DCUDAToolkit_ROOT=$CUDAToolkit_ROOT"
+}
+
+postHooks+=(setupCUDAToolkitCompilers)
+preConfigureHooks+=(setupCMakeCUDAToolkit_ROOT)
diff --git a/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix b/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix
index 3a6a16b1d7698..ec2c9cf72a9e2 100644
--- a/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix
+++ b/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix
@@ -4,6 +4,7 @@
 , fetchurl
 , autoPatchelfHook
 , autoAddOpenGLRunpathHook
+, markForCudatoolkitRootHook
 }:
 
 pname:
@@ -28,6 +29,7 @@ backendStdenv.mkDerivation {
     # directory to the rpath of all ELF binaries.
     # Check e.g. with `patchelf --print-rpath path/to/my/binary
     autoAddOpenGLRunpathHook
+    markForCudatoolkitRootHook
   ];
 
   buildInputs = [
diff --git a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix
index 7b8e02de24741..b962b6caa1b27 100644
--- a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix
+++ b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix
@@ -27,35 +27,27 @@ in
       inherit (prev.backendStdenv) cc;
     in
     {
-      # Point NVCC at a compatible compiler
-      # FIXME: non-redist cudatoolkit copy-pastes this code
+      # Required by cmake's enable_language(CUDA) to build a test program
+      # When implementing cross-compilation support: this is
+      # final.pkgs.targetPackages.cudaPackages.cuda_cudart
+      env.cudartRoot = "${prev.lib.getDev final.cuda_cudart}";
 
-      # For CMake-based projects:
-      # https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
-      # https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html
-      # https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html
+      # Point NVCC at a compatible compiler
 
-      # For non-CMake projects:
-      # We prepend --compiler-bindir to nvcc flags.
-      # Downstream packages can override these, because NVCC
-      # uses the last --compiler-bindir it gets on the command line.
-      # FIXME: this results in "incompatible redefinition" warnings.
-      # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin
-      # NOTE: We unconditionally set -Xfatbin=-compress-all, which reduces the size of the
-      #   compiled binaries. If binaries grow over 2GB, they will fail to link. This is a problem
-      #   for us, as the default set of CUDA capabilities we build can regularly cause this to
-      #   occur (for example, with Magma).
-      postInstall = (oldAttrs.postInstall or "") + ''
-        mkdir -p $out/nix-support
-        cat <<EOF >> $out/nix-support/setup-hook
-        cmakeFlags+=' -DCUDA_HOST_COMPILER=${cc}/bin'
-        cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${cc}/bin'
-        if [ -z "\''${CUDAHOSTCXX-}" ]; then
-          export CUDAHOSTCXX=${cc}/bin;
-        fi
-        export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${cc}/bin -Xfatbin=-compress-all'
-        EOF
-      '';
+      # Desiredata: whenever a package (e.g. magma) adds cuda_nvcc to
+      # nativeBuildInputs (offsets `(-1, 0)`), magma should also source the
+      # setupCudaHook, i.e. we want it the hook to be propagated into the
+      # same nativeBuildInputs.
+      #
+      # Logically, cuda_nvcc should include the hook in depsHostHostPropagated,
+      # so that the final offsets for the propagated hook would be `(-1, 0) +
+      # (0, 0) = (-1, 0)`.
+      #
+      # In practice, TargetTarget appears to work:
+      # https://gist.github.com/fd80ff142cd25e64603618a3700e7f82
+      depsTargetTargetPropagated = [
+        final.setupCudaHook
+      ];
     });
 
   cuda_nvprof = prev.cuda_nvprof.overrideAttrs (oldAttrs: {
diff --git a/pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt b/pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt
new file mode 100644
index 0000000000000..a6954e6e8bee2
--- /dev/null
+++ b/pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.25)
+project(saxpy LANGUAGES CXX CUDA)
+
+find_package(CUDAToolkit REQUIRED COMPONENTS cudart cublas)
+
+add_executable(saxpy saxpy.cu)
+target_link_libraries(saxpy PUBLIC CUDA::cublas CUDA::cudart m)
+target_compile_features(saxpy PRIVATE cxx_std_14)
+target_compile_options(saxpy PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
+                                     --expt-relaxed-constexpr>)
+
+install(TARGETS saxpy)
diff --git a/pkgs/development/compilers/cudatoolkit/saxpy/default.nix b/pkgs/development/compilers/cudatoolkit/saxpy/default.nix
new file mode 100644
index 0000000000000..f347b43d1d11c
--- /dev/null
+++ b/pkgs/development/compilers/cudatoolkit/saxpy/default.nix
@@ -0,0 +1,50 @@
+{ autoAddOpenGLRunpathHook
+, backendStdenv
+, cmake
+, cuda_cccl
+, cuda_cudart
+, cudaFlags
+, cuda_nvcc
+, lib
+, libcublas
+, setupCudaHook
+, stdenv
+}:
+
+backendStdenv.mkDerivation {
+  pname = "saxpy";
+  version = "unstable-2023-07-11";
+
+  src = ./.;
+
+  buildInputs = [
+    libcublas
+    cuda_cudart
+    cuda_cccl
+  ];
+  nativeBuildInputs = [
+    cmake
+
+    # NOTE: this needs to be pkgs.buildPackages.cudaPackages_XX_Y.cuda_nvcc for
+    # cross-compilation to work. This should work automatically once we move to
+    # spliced scopes. Delete this comment once that happens
+    cuda_nvcc
+
+    # Alternatively, we could remove the propagated hook from cuda_nvcc and add
+    # directly:
+    # setupCudaHook
+    autoAddOpenGLRunpathHook
+  ];
+
+  cmakeFlags = [
+    "-DCMAKE_VERBOSE_MAKEFILE=ON"
+    "-DCMAKE_CUDA_ARCHITECTURES=${with cudaFlags; builtins.concatStringsSep ";" (map dropDot cudaCapabilities)}"
+  ];
+
+  meta = {
+    description = "A simple (Single-precision AX Plus Y) FindCUDAToolkit.cmake example for testing cross-compilation";
+    license = lib.licenses.mit;
+    maintainers = lib.teams.cuda.members;
+    platforms = lib.platforms.unix;
+  };
+}
diff --git a/pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu b/pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu
new file mode 100644
index 0000000000000..912a6d1647b14
--- /dev/null
+++ b/pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu
@@ -0,0 +1,68 @@
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <vector>
+
+#include <stdio.h>
+
+static inline void check(cudaError_t err, const char *context) {
+  if (err != cudaSuccess) {
+    fprintf(stderr, "CUDA error at %s: %s\n", context, cudaGetErrorString(err));
+    std::exit(EXIT_FAILURE);
+  }
+}
+
+#define CHECK(x) check(x, #x)
+
+__global__ void saxpy(int n, float a, float *x, float *y) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n)
+    y[i] = a * x[i] + y[i];
+}
+
+int main(void) {
+  setbuf(stderr, NULL);
+  fprintf(stderr, "Start\n");
+
+  int rtVersion, driverVersion;
+  CHECK(cudaRuntimeGetVersion(&rtVersion));
+  CHECK(cudaDriverGetVersion(&driverVersion));
+
+  fprintf(stderr, "Runtime version: %d\n", rtVersion);
+  fprintf(stderr, "Driver version: %d\n", driverVersion);
+
+  constexpr int N = 1 << 10;
+
+  std::vector<float> xHost(N), yHost(N);
+  for (int i = 0; i < N; i++) {
+    xHost[i] = 1.0f;
+    yHost[i] = 2.0f;
+  }
+
+  fprintf(stderr, "Host memory initialized, copying to the device\n");
+  fflush(stderr);
+
+  float *xDevice, *yDevice;
+  CHECK(cudaMalloc(&xDevice, N * sizeof(float)));
+  CHECK(cudaMalloc(&yDevice, N * sizeof(float)));
+
+  CHECK(cudaMemcpy(xDevice, xHost.data(), N * sizeof(float),
+                   cudaMemcpyHostToDevice));
+  CHECK(cudaMemcpy(yDevice, yHost.data(), N * sizeof(float),
+                   cudaMemcpyHostToDevice));
+  fprintf(stderr, "Scheduled a cudaMemcpy, calling the kernel\n");
+
+  saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, xDevice, yDevice);
+  fprintf(stderr, "Scheduled a kernel call\n");
+  CHECK(cudaGetLastError());
+
+  CHECK(cudaMemcpy(yHost.data(), yDevice, N * sizeof(float),
+                   cudaMemcpyDeviceToHost));
+
+  float maxError = 0.0f;
+  for (int i = 0; i < N; i++)
+    maxError = max(maxError, abs(yHost[i] - 4.0f));
+  fprintf(stderr, "Max error: %f\n", maxError);
+
+  CHECK(cudaFree(xDevice));
+  CHECK(cudaFree(yDevice));
+}
diff --git a/pkgs/development/libraries/science/math/magma/generic.nix b/pkgs/development/libraries/science/math/magma/generic.nix
index d10da6843c36b..04f263568ce6f 100644
--- a/pkgs/development/libraries/science/math/magma/generic.nix
+++ b/pkgs/development/libraries/science/math/magma/generic.nix
@@ -86,29 +86,6 @@ let
     # "75" -> "750"  Cf. https://bitbucket.org/icl/magma/src/f4ec79e2c13a2347eff8a77a3be6f83bc2daec20/CMakeLists.txt#lines-273
     "${minArch'}0";
 
-  cuda-common-redist = with cudaPackages; [
-    libcublas # cublas_v2.h
-    libcusparse # cusparse.h
-  ];
-
-  # Build-time dependencies
-  cuda-native-redist = symlinkJoin {
-    name = "cuda-native-redist-${cudaVersion}";
-    paths = with cudaPackages; [
-      cuda_cudart # cuda_runtime.h
-      cuda_nvcc
-    ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
-      cuda_nvprof # <cuda_profiler_api.h>
-    ] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [
-      cuda_profiler_api # <cuda_profiler_api.h>
-    ] ++ cuda-common-redist;
-  };
-
-  # Run-time dependencies
-  cuda-redist = symlinkJoin {
-    name = "cuda-redist-${cudaVersion}";
-    paths = cuda-common-redist;
-  };
 in
 
 assert (builtins.match "[^[:space:]]*" gpuTargetString) != null;
@@ -128,16 +105,22 @@ stdenv.mkDerivation {
     ninja
     gfortran
   ] ++ lists.optionals cudaSupport [
-    cuda-native-redist
+    cudaPackages.cuda_nvcc
   ];
 
   buildInputs = [
     libpthreadstubs
     lapack
     blas
-  ] ++ lists.optionals cudaSupport [
-    cuda-redist
-  ] ++ lists.optionals rocmSupport [
+  ] ++ lists.optionals cudaSupport (with cudaPackages; [
+    cuda_cudart
+    libcublas # cublas_v2.h
+    libcusparse # cusparse.h
+  ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
+    cuda_nvprof # <cuda_profiler_api.h>
+  ] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [
+    cuda_profiler_api # <cuda_profiler_api.h>
+  ]) ++ lists.optionals rocmSupport [
     hip
     hipblas
     hipsparse