NixOS · ConnorBaker · Nov 26, 2025 · Nov 25, 2025 · GaetanLepage · Dec 3, 2025
diff --git a/pkgs/development/cuda-modules/packages/cuda_nvcc.nix b/pkgs/development/cuda-modules/packages/cuda_nvcc.nix
@@ -8,6 +8,7 @@
   cuda_cccl,
   lib,
   libnvvm,
+  makeBinaryWrapper,
 }:
 buildRedist (finalAttrs: {
   redistName = "cuda";
@@ -22,6 +23,10 @@ buildRedist (finalAttrs: {
   # The nvcc and cicc binaries contain hard-coded references to /usr
   allowFHSReferences = true;
 
+  nativeBuildInputs = [
+    makeBinaryWrapper
+  ];
+
   # Entries here will be in nativeBuildInputs when cuda_nvcc is in nativeBuildInputs
   propagatedBuildInputs = [ setupCudaHook ];
 
@@ -144,13 +149,21 @@ buildRedist (finalAttrs: {
         EOF
       ''
       # Add the dependency on backendStdenv.cc to the nvcc.profile.
+      # NOTE: NVCC explodes in horrifying fashion if GCC is not on PATH -- it fails even before
+      # reading nvcc.profile!
       + ''
-        nixLog "adding backendStdenv.cc to nvcc.profile"
+        nixLog "setting compiler-bindir to backendStdenv.cc in nvcc.profile"
         cat << EOF >> "''${!outputBin:?}/bin/nvcc.profile"
-
         # Fix a compatible backend compiler
-        PATH += "${backendStdenv.cc}/bin":
+        compiler-bindir = ${backendStdenv.cc}/bin
         EOF
+
+        nixLog "wrapping nvcc to add backendStdenv.cc to its PATH"
+        mv "''${!outputBin:?}/bin/nvcc" "''${!outputBin:?}/bin/.nvcc-wrapped"
+        makeBinaryWrapper \
+          "''${!outputBin:?}/bin/.nvcc-wrapped" \
+          "''${!outputBin:?}/bin/nvcc" \
+          --prefix PATH : ${lib.makeBinPath [ backendStdenv.cc ]}
       ''
     );
 

diff --git a/pkgs/development/cuda-modules/packages/setupCudaHook/package.nix b/pkgs/development/cuda-modules/packages/setupCudaHook/package.nix
@@ -5,9 +5,6 @@ makeSetupHook {
 
   substitutions.setupCudaHook = placeholder "out";
 
-  # Point NVCC at a compatible compiler
-  substitutions.ccRoot = "${backendStdenv.cc}";
-
-  # Required in addition to ccRoot as otherwise bin/gcc is looked up
+  # Required in addition to nvcc.profile's compiler-bindir as otherwise bin/gcc is looked up
   # when building CMakeCUDACompilerId.cu
   substitutions.ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++";

diff --git a/pkgs/development/cuda-modules/packages/setupCudaHook/setup-cuda-hook.sh b/pkgs/development/cuda-modules/packages/setupCudaHook/setup-cuda-hook.sh
@@ -92,8 +92,6 @@ setupCUDAToolkitCompilers() {
     export CUDAHOSTCXX="@ccFullPath@"
   fi
 
-  appendToVar NVCC_PREPEND_FLAGS "--compiler-bindir=@ccRoot@/bin"
-
   # NOTE: We set -Xfatbin=-compress-all, which reduces the size of the compiled
   #   binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as
   #   the default set of CUDA capabilities we build can regularly cause this to occur (for

diff --git a/pkgs/development/python-modules/pycuda/compyte.nix b/pkgs/development/python-modules/pycuda/compyte.nix
diff --git a/pkgs/development/python-modules/pycuda/default.nix b/pkgs/development/python-modules/pycuda/default.nix
@@ -1,81 +1,233 @@
 {
+  _cuda,
+  boost,
   buildPythonPackage,
-  addDriverRunpath,
-  fetchPypi,
+  cudaPackages,
   fetchFromGitHub,
+  lib,
   mako,
-  boost,
   numpy,
-  pytools,
-  pytest,
-  decorator,
-  appdirs,
-  six,
-  cudaPackages,
+  platformdirs,
   python,
-  mkDerivation,
-  lib,
+  pytools,
+  setuptools,
+  wheel,
+  writableTmpDirAsHomeHook,
+  writeShellApplication,
+  stdenvNoCC,
 }:
 let
-  compyte = import ./compyte.nix { inherit mkDerivation fetchFromGitHub; };
-
-  inherit (cudaPackages) cudatoolkit;
+  inherit (_cuda.lib) dropDots;
+  inherit (cudaPackages)
+    backendStdenv
+    cuda_cudart
+    cuda_cccl
+    cuda_nvcc
+    cuda_profiler_api
+    libcurand
+    ;
+  inherit (lib)
+    getBin
+    getFirstOutput
+    getInclude
+    getLib
+    licenses
+    maintainers
+    teams
+    ;
 in
-buildPythonPackage rec {
+buildPythonPackage {
+  __structuredAttrs = true;
+
   pname = "pycuda";
   version = "2025.1.2";
-  format = "setuptools";
 
-  src = fetchPypi {
-    inherit pname version;
-    hash = "sha256-DdgpEdctjgPGMSiuROmc+3tGiQlKumzFGT2OlEcXqvo=";
+  pyproject = true;
+
+  stdenv = backendStdenv;
+
+  src = fetchFromGitHub {
+    owner = "inducer";
+    repo = "pycuda";
+    tag = "v2025.1.2";
+    hash = "sha256-JMGVNjiKCAno29df8Zk3njvpgvz9JE8mb0HeJMVTnCQ=";
+    # Use the vendored compyte source rather than tracking it as a separate dependency.
+    # As an added bonus, this should unbreak the update script added by buildPythonPackage.
+    fetchSubmodules = true;
   };
 
-  preConfigure = with lib.versions; ''
-    ${python.pythonOnBuildForHost.interpreter} configure.py --boost-inc-dir=${boost.dev}/include \
-                          --boost-lib-dir=${boost}/lib \
-                          --no-use-shipped-boost \
-                          --boost-python-libname=boost_python${major python.version}${minor python.version} \
-                          --cuda-root=${cudatoolkit}
-  '';
+  build-system = [
+    setuptools
+    wheel
+  ];
 
-  postInstall = ''
-    ln -s ${compyte} $out/${python.sitePackages}/pycuda/compyte
-  '';
+  nativeBuildInputs = [
+    cuda_nvcc
+  ];
 
-  postFixup = ''
-    find $out/lib -type f \( -name '*.so' -or -name '*.so.*' \) | while read lib; do
-      echo "setting opengl runpath for $lib..."
-      addDriverRunpath "$lib"
-    done
-  '';
+  prePatch = ''
+    nixLog "removing vendored boost source"
+    rm -rf "$PWD/bpl-subset"
 
-  # Requires access to libcuda.so.1 which is provided by the driver
-  doCheck = false;
+    nixLog "patching $PWD/pycuda/compiler.py::compile to fix CUDA include paths"
+    substituteInPlace "$PWD/pycuda/compiler.py" \
+    --replace-fail \
+    'include_dirs = [*include_dirs, _find_pycuda_include_path()]' \
+    'include_dirs = [
+        *include_dirs,
+        "${getInclude cuda_nvcc}/include",
+        "${getInclude cuda_cudart}/include",
+        "${getInclude cuda_cccl}/include",
+        "${getInclude cuda_profiler_api}/include",
+        "${getInclude libcurand}/include",
+        _find_pycuda_include_path(),
+    ]'
 
-  checkPhase = ''
-    py.test
-  '';
+    nixLog "patching $PWD/pycuda/compiler.py::compile to fix NVCC path"
+    substituteInPlace "$PWD/pycuda/compiler.py" \
+      --replace-fail \
+        'nvcc="nvcc"' \
+        'nvcc="${getBin cuda_nvcc}/bin/nvcc"'
 
-  nativeBuildInputs = [ addDriverRunpath ];
+    nixLog "patching $PWD/pycuda/compiler.py::DynamicModule.__init__ to fix CUDA runtime library path"
+    substituteInPlace "$PWD/pycuda/compiler.py" \
+      --replace-fail \
+        'cuda_libdir=None,' \
+        'cuda_libdir="${getLib cuda_cudart}/lib",'
+  '';
 
-  propagatedBuildInputs = [
+  dependencies = [
+    boost
+    mako
     numpy
+    platformdirs
     pytools
-    pytest
-    decorator
-    appdirs
-    six
-    cudatoolkit
-    compyte
-    python
-    mako
   ];
 
-  meta = with lib; {
-    homepage = "https://github.com/inducer/pycuda/";
+  buildInputs = [
+    cuda_cccl
+    cuda_cudart
+    cuda_nvcc
+    cuda_profiler_api
+    libcurand
+  ];
+
+  configureScript = "./configure.py";
+
+  # configure.py doesn't support the installation directory arguments _multioutConfig sets.
+  # The other arguments provided by configurePhase, like --prefix, --enable-shared, and --disable-static, are ignored.
+  setOutputFlags = false;
+
+  configureFlags = [
+    "--no-use-shipped-boost"
+    # The expected boost python library name is something like boost_python-py313, but our library name doesn't have a
+    # hyphen. The pythonVersion is already a major-minor version, so we just need to remove the dot.
+    "--boost-python-libname=boost_python${dropDots python.pythonVersion}"
+    # Point configure.py at the lib/stubs directory of cuda_cudart for the CUDA driver library.
+    "--cudadrv-lib-dir=${getFirstOutput [ "stubs" "lib" ] cuda_cudart}/lib/stubs"
+  ];
+
+  # Requires access to libcuda.so.1 which is provided by the driver
+  doCheck = false;
+
+  # From setup.py
+  pythonImportsCheck = [
+    "pycuda"
+    # "pycuda.gl" # Requires the CUDA driver
+    "pycuda.sparse"
+    "pycuda.compyte"
+  ];
+
+  # TODO: Split into testers and tests.
+  # NOTE: Tests take 23m to run on a 4090 and require 18GB of VRAM.
+  passthru = {
+    testers.tester = writeShellApplication {
+      derivationArgs = {
+        __structuredAttrs = true;
+        strictDeps = true;
+      };
+      name = "pycuda-tester";
+      runtimeInputs = [
+        (python.withPackages (ps: [
+          ps.pycuda
+          ps.pytest
+        ]))
+      ];
+      text = ''
+        echo "Copying pycuda test sources to $PWD/pycuda_test_sources"
+        mkdir -p "$PWD/pycuda_test_sources"
+        cp -r "${python.pkgs.pycuda.src}"/test "$PWD/pycuda_test_sources"
+        chmod -R u+w "$PWD/pycuda_test_sources"
+
+        pushd "$PWD/pycuda_test_sources"
+        pytest "$@"
+        popd
+      '';
+    };
+    tests =
+      let
+        makeTest =
+          name: testArgs:
+          stdenvNoCC.mkDerivation {
+            __structuredAttrs = true;
+            strictDeps = true;
+
+            inherit name testArgs;
+
+            dontUnpack = true;
+
+            nativeBuildInputs = [
+              # cuda_nvcc
+              python.pkgs.pycuda.passthru.testers.tester
+              writableTmpDirAsHomeHook
+            ];
+
+            dontConfigure = true;
+
+            buildPhase = ''
+              nixLog "using testArgs: ''${testArgs[*]@Q}"
+              pycuda-tester "''${testArgs[@]}" || {
+                nixErrorLog "pycuda-tester finished with non-zero exit code: $?"
+                exit 1
+              }
+            '';
+
+            postInstall = ''
+              touch $out
+            '';
+
+            requiredSystemFeatures = [ "cuda" ];
+          };
+      in
+      {
+        # Fast: ~10s on a 4090
+        driver = makeTest "pycuda-driver-tests" [ "test/test_driver.py" ];
+        # Fast: ~16s on a 4090
+        cumath = makeTest "pycuda-cumath-tests" [ "test/test_cumath.py" ];
+        # Fast: ~3m on a 4090
+        gpuarray-fast = makeTest "pycuda-gpuarray-fast-tests" [
+          "test/test_gpuarray.py"
+          "-k"
+          "not test_curand_wrappers"
+        ];
+        # EXTREMELY SLOW: ~20m on a 4090
+        gpuarray-slow = makeTest "pycuda-gpuarray-slow-tests" [
+          "test/test_gpuarray.py"
+          "-k"
+          "test_curand_wrappers"
+        ];
+      };
+  };
+
+  meta = {
     description = "CUDA integration for Python";
+    homepage = "https://github.com/inducer/pycuda/";
     license = licenses.mit;
-    maintainers = [ ];
+    platforms = [
+      "aarch64-linux"
+      "x86_64-linux"
+    ];
+    maintainers = with maintainers; [ connorbaker ];
+    teams = [ teams.cuda ];
   };
 }
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
@@ -13017,7 +13017,7 @@ self: super: with self; {
 
   pyctr = callPackage ../development/python-modules/pyctr { };
 
-  pycuda = callPackage ../development/python-modules/pycuda { inherit (pkgs.stdenv) mkDerivation; };
+  pycuda = callPackage ../development/python-modules/pycuda { };
 
   pycups = callPackage ../development/python-modules/pycups { };