From 94bbbb047180b59ef6f25f4d89e7a12b5fc6db63 Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Sat, 25 Feb 2023 16:32:55 +0200 Subject: [PATCH 01/13] cudaPackages: point nvcc at a compatible -ccbin This is a hot-fix to un-break cuda-enabled packages (like tensorflow, jaxlib, faiss, opencv, ...) after the gcc11->gcc12 bump. We should probably build the whole downstream packages with a compatible stdenv (such as gcc11Stdenv for cudaPackages_11), but just pointing nvcc at the right compiler seems to do the trick We already used this hack for non-redist cudatoolkit. Now we use it more consistently. This commit also re-links cuda packages against libstdc++ from the same "compatible" gcc, rather than the current stdenv. We didn't test if this is necessary -> need to revise in further PRs. NOTE: long-term we should make it possible to override -ccbin and use e.g. clang --- .../compilers/cudatoolkit/common.nix | 40 +++++++++++++++---- .../redist/build-cuda-redist-package.nix | 14 ++++++- .../cudatoolkit/redist/overrides.nix | 39 +++++++++++++++++- .../libraries/science/math/nccl/default.nix | 31 ++++++++++---- 4 files changed, 106 insertions(+), 18 deletions(-) diff --git a/pkgs/development/compilers/cudatoolkit/common.nix b/pkgs/development/compilers/cudatoolkit/common.nix index e986ae2dc14da..a94f6fbdaf736 100644 --- a/pkgs/development/compilers/cudatoolkit/common.nix +++ b/pkgs/development/compilers/cudatoolkit/common.nix @@ -151,9 +151,31 @@ stdenv.mkDerivation rec { mkdir -p $out/nix-support echo "cmakeFlags+=' -DCUDA_TOOLKIT_ROOT_DIR=$out'" >> $out/nix-support/setup-hook - # Set the host compiler to be used by nvcc for CMake-based projects: + # Set the host compiler to be used by nvcc. + # FIXME: redist cuda_nvcc copy-pastes this code + + # For CMake-based projects: # https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables - echo "cmakeFlags+=' -DCUDA_HOST_COMPILER=${gcc}/bin'" >> $out/nix-support/setup-hook + # https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html + # https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html + + # For non-CMake projects: + # FIXME: results in "incompatible redefinition" warnings ...but we keep + # both this and cmake variables until we come up with a more general + # solution + # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin + + cat <> $out/nix-support/setup-hook + + cmakeFlags+=' -DCUDA_HOST_COMPILER=${gcc}/bin' + cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${gcc}/bin' + if [ -z "\''${CUDAHOSTCXX-}" ]; then + export CUDAHOSTCXX=${gcc}/bin; + fi + + export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${gcc}/bin' + EOF + # Move some libraries to the lib output so that programs that # depend on them don't pull in this entire monstrosity. @@ -167,10 +189,6 @@ stdenv.mkDerivation rec { mv $out/extras/CUPTI/lib64/libcupti* $out/lib ''} - # Set compiler for NVCC. - wrapProgram $out/bin/nvcc \ - --prefix PATH : ${gcc}/bin - # nvprof do not find any program to profile if LD_LIBRARY_PATH is not set wrapProgram $out/bin/nvprof \ --prefix LD_LIBRARY_PATH : $out/lib @@ -191,7 +209,15 @@ stdenv.mkDerivation rec { preFixup = let rpath = lib.concatStringsSep ":" [ (lib.makeLibraryPath (runtimeDependencies ++ [ "$lib" "$out" "$out/nvvm" ])) - "${stdenv.cc.cc.lib}/lib64" + + # The path to libstdc++ and such + # + # NB: + # 1. "gcc" (gcc-wrapper) here is what's exposed as cudaPackages.cudatoolkit.cc + # 2. "gcc.cc" is the unwrapped gcc + # 3. 
"gcc.cc.lib" is one of its outputs + "${gcc.cc.lib}/lib64" + "$out/jre/lib/amd64/jli" "$out/lib64" "$out/nvvm/lib64" diff --git a/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix b/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix index 9bbd7ea1da119..3bf9184eefabb 100644 --- a/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix +++ b/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix @@ -10,7 +10,8 @@ attrs: let arch = "linux-x86_64"; -in stdenv.mkDerivation { +in +stdenv.mkDerivation { inherit pname; inherit (attrs) version; @@ -29,7 +30,14 @@ in stdenv.mkDerivation { ]; buildInputs = [ - stdenv.cc.cc.lib + # autoPatchelfHook will search for a libstdc++ and we're giving it a + # "compatible" libstdc++ from the same toolchain that NVCC uses. + # + # E.g. it might happen that stdenv=gcc12Stdenv, but we build against cuda11 + # that only "supports" gcc11. Linking against gcc12's libraries we might + # sometimes actually sometimes encounter dynamic linkage errors at runtime + # NB: We don't actually know if this is the right thing to do + cudatoolkit.cc.cc.lib ]; dontBuild = true; @@ -43,6 +51,8 @@ in stdenv.mkDerivation { runHook postInstall ''; + passthru.stdenv = stdenv; + meta = { description = attrs.name; license = lib.licenses.unfree; diff --git a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix index bcf16db6e12eb..663af1db7632b 100644 --- a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix +++ b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix @@ -1,6 +1,8 @@ -final: prev: let +final: prev: +let inherit (prev) lib pkgs; -in (lib.filterAttrs (attr: _: (prev ? "${attr}")) { +in +(lib.filterAttrs (attr: _: (prev ? "${attr}")) { ### Overrides to fix the components of cudatoolkit-redist # Attributes that don't exist in the previous set are removed. @@ -20,6 +22,39 @@ in (lib.filterAttrs (attr: _: (prev ? "${attr}")) { prev.libcublas ]; + cuda_nvcc = prev.cuda_nvcc.overrideAttrs (oldAttrs: + let + inherit (prev.cudatoolkit) cc; + in + { + # Point NVCC at a compatible compiler + # FIXME: non-redist cudatoolkit copy-pastes this code + + # For CMake-based projects: + # https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables + # https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html + # https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html + + # For non-CMake projects: + # We prepend --compiler-bindir to nvcc flags. + # Downstream packages can override these, because NVCC + # uses the last --compiler-bindir it gets on the command line. + # FIXME: this results in "incompatible redefinition" warnings. 
+ # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin + postInstall = (oldAttrs.postInstall or "") + '' + mkdir -p $out/nix-support + cat <> $out/nix-support/setup-hook + cmakeFlags+=' -DCUDA_TOOLKIT_ROOT_DIR=$out' + cmakeFlags+=' -DCUDA_HOST_COMPILER=${cc}/bin' + cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${cc}/bin' + if [ -z "\''${CUDAHOSTCXX-}" ]; then + export CUDAHOSTCXX=${cc}/bin; + fi + export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${cc}/bin' + EOF + ''; + }); + cuda_nvprof = prev.cuda_nvprof.overrideAttrs (oldAttrs: { nativeBuildInputs = oldAttrs.nativeBuildInputs ++ [ pkgs.addOpenGLRunpath ]; buildInputs = oldAttrs.buildInputs ++ [ prev.cuda_cupti ]; diff --git a/pkgs/development/libraries/science/math/nccl/default.nix b/pkgs/development/libraries/science/math/nccl/default.nix index 99aed3a6a30ef..df76cda96b1d4 100644 --- a/pkgs/development/libraries/science/math/nccl/default.nix +++ b/pkgs/development/libraries/science/math/nccl/default.nix @@ -1,11 +1,9 @@ { lib, stdenv, fetchFromGitHub, which, cudaPackages, addOpenGLRunpath }: -let - inherit (cudaPackages) cudatoolkit; -in +with cudaPackages; stdenv.mkDerivation rec { - name = "nccl-${version}-cuda-${cudatoolkit.majorVersion}"; + name = "nccl-${version}-cuda-${cudaPackages.cudaMajorVersion}"; version = "2.16.5-1"; src = fetchFromGitHub { @@ -17,16 +15,35 @@ stdenv.mkDerivation rec { outputs = [ "out" "dev" ]; - nativeBuildInputs = [ which addOpenGLRunpath ]; + nativeBuildInputs = [ + which + addOpenGLRunpath + cuda_nvcc + ]; - buildInputs = [ cudatoolkit ]; + buildInputs = [ + cuda_cudart + ]; preConfigure = '' patchShebangs src/collectives/device/gen_rules.sh + '' + # We need NVCC to use a compatible backend compiler (we maintain a link to + # that in `cudatoolkit.cc`). We ship NVCC with a setup-hook that *prepends* + # the correct -ccbin to nvcc flags. NCCL's Makefile, however, appends another + # -ccbin, which points at the host platform's compiler, coming from the + # `stdenv`. Confer + # https://github.com/NVIDIA/nccl/blob/f3d51667838f7542df8ea32ea4e144d812b3ed7c/makefiles/common.mk#L65 + # Since NVCC will use the last -ccbin on the command-line, we append the correct path again. + # We hope it's a temporary solution + + '' + export NVCC_APPEND_FLAGS+=' --compiler-bindir=${cudatoolkit.cc}/bin' ''; makeFlags = [ - "CUDA_HOME=${cudatoolkit}" + "CUDA_HOME=${cuda_nvcc}" + "CUDA_LIB=${cuda_cudart}/lib64" + "CUDA_INC=${cuda_cudart}/include" "PREFIX=$(out)" ]; From cf7fb1d08f928f48725f15e595cbb84793278379 Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Mon, 27 Feb 2023 14:54:09 +0200 Subject: [PATCH 02/13] python3Packages.tensorflow: add cudaCapabilities argument Rearrange tensorflow to allow overriding cudaCapabilities. This is needed when debugging the tensorflow derivation --- pkgs/development/compilers/cudatoolkit/flags.nix | 3 +-- pkgs/development/python-modules/tensorflow/default.nix | 8 +++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pkgs/development/compilers/cudatoolkit/flags.nix b/pkgs/development/compilers/cudatoolkit/flags.nix index 8e1e54723b2e4..9d7b7f884ad2f 100644 --- a/pkgs/development/compilers/cudatoolkit/flags.nix +++ b/pkgs/development/compilers/cudatoolkit/flags.nix @@ -1,6 +1,6 @@ { config , lib -, cudatoolkit +, cudaVersion }: # Type aliases @@ -13,7 +13,6 @@ let inherit (lib) attrsets lists strings trivial versions; - cudaVersion = cudatoolkit.version; # Flags are determined based on your CUDA toolkit by default. 
You may benefit # from improved performance, reduced file size, or greater hardware suppport by diff --git a/pkgs/development/python-modules/tensorflow/default.nix b/pkgs/development/python-modules/tensorflow/default.nix index f7d920c372217..f18a924c31fa2 100644 --- a/pkgs/development/python-modules/tensorflow/default.nix +++ b/pkgs/development/python-modules/tensorflow/default.nix @@ -17,7 +17,9 @@ # that in nix as well. It would make some things easier and less confusing, but # it would also make the default tensorflow package unfree. See # https://groups.google.com/a/tensorflow.org/forum/#!topic/developers/iRCt5m4qUz0 -, cudaSupport ? false, cudaPackages ? {} +, cudaSupport ? false +, cudaPackages ? { } +, cudaCapabilities ? cudaPackages.cudaFlags.cudaCapabilities , mklSupport ? false, mkl , tensorboardSupport ? true # XLA without CUDA is broken @@ -30,7 +32,7 @@ }: let - inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl; + inherit (cudaPackages) cudatoolkit cudnn nccl; in assert cudaSupport -> cudatoolkit != null @@ -301,7 +303,7 @@ let TF_CUDA_PATHS = lib.optionalString cudaSupport "${cudatoolkit_joined},${cudnn},${nccl}"; GCC_HOST_COMPILER_PREFIX = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin"; GCC_HOST_COMPILER_PATH = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin/gcc"; - TF_CUDA_COMPUTE_CAPABILITIES = builtins.concatStringsSep "," cudaFlags.cudaRealArches; + TF_CUDA_COMPUTE_CAPABILITIES = lib.concatStringsSep "," cudaCapabilities; postPatch = '' # bazel 3.3 should work just as well as bazel 3.1 From 79397957e876ef7fe6eccbcb58d23fb5c58f121c Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Mon, 27 Feb 2023 16:21:56 +0200 Subject: [PATCH 03/13] cudaPackages.nccl: respect cudaCapabilities --- .../libraries/science/math/nccl/default.nix | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/pkgs/development/libraries/science/math/nccl/default.nix b/pkgs/development/libraries/science/math/nccl/default.nix index df76cda96b1d4..c047961c6c003 100644 --- a/pkgs/development/libraries/science/math/nccl/default.nix +++ b/pkgs/development/libraries/science/math/nccl/default.nix @@ -1,8 +1,18 @@ -{ lib, stdenv, fetchFromGitHub, which, cudaPackages, addOpenGLRunpath }: +{ lib +, backendStdenv +, fetchFromGitHub +, which +, cudaPackages ? { } +, addOpenGLRunpath +}: with cudaPackages; -stdenv.mkDerivation rec { +let + # Output looks like "-gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_86,code=compute_86" + gencode = lib.concatStringsSep " " cudaFlags.cudaGencode; +in +backendStdenv.mkDerivation rec { name = "nccl-${version}-cuda-${cudaPackages.cudaMajorVersion}"; version = "2.16.5-1"; @@ -27,17 +37,9 @@ stdenv.mkDerivation rec { preConfigure = '' patchShebangs src/collectives/device/gen_rules.sh - '' - # We need NVCC to use a compatible backend compiler (we maintain a link to - # that in `cudatoolkit.cc`). We ship NVCC with a setup-hook that *prepends* - # the correct -ccbin to nvcc flags. NCCL's Makefile, however, appends another - # -ccbin, which points at the host platform's compiler, coming from the - # `stdenv`. Confer - # https://github.com/NVIDIA/nccl/blob/f3d51667838f7542df8ea32ea4e144d812b3ed7c/makefiles/common.mk#L65 - # Since NVCC will use the last -ccbin on the command-line, we append the correct path again. 
- # We hope it's a temporary solution - + '' - export NVCC_APPEND_FLAGS+=' --compiler-bindir=${cudatoolkit.cc}/bin' + makeFlagsArray+=( + "NVCC_GENCODE=${gencode}" + ) ''; makeFlags = [ From e305011223c940a8dd661f64eb5cd5384c15ddbe Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Wed, 1 Mar 2023 16:39:04 +0200 Subject: [PATCH 04/13] cudaPackages_12.nccl: fix new missing inputs --- pkgs/development/libraries/science/math/nccl/default.nix | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkgs/development/libraries/science/math/nccl/default.nix b/pkgs/development/libraries/science/math/nccl/default.nix index c047961c6c003..4f82de8e8b6fa 100644 --- a/pkgs/development/libraries/science/math/nccl/default.nix +++ b/pkgs/development/libraries/science/math/nccl/default.nix @@ -33,6 +33,8 @@ backendStdenv.mkDerivation rec { buildInputs = [ cuda_cudart + ] ++ lib.optionals (lib.versionAtLeast cudaVersion "12.0.0") [ + cuda_cccl ]; preConfigure = '' From d378cc6fb23d67f3d9f86c39051f810c563789ca Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Mon, 27 Feb 2023 14:58:14 +0200 Subject: [PATCH 05/13] opencv4: respect config.cudaCapabilities This is needed for faster builds when debugging the opencv derivation, and it's more consistent with other cuda-enabled packages -DCUDA_GENERATION seems to expect architecture names, so we refactor cudaFlags to facilitate easier extraction of the configured archnames --- .../science/math/mxnet/default.nix | 2 +- .../compilers/cudatoolkit/flags.nix | 140 +++++++++++------- pkgs/development/libraries/opencv/4.x.nix | 13 +- .../libraries/science/math/magma/generic.nix | 6 +- .../libraries/science/math/nccl/default.nix | 2 +- .../python-modules/jaxlib/default.nix | 2 +- 6 files changed, 102 insertions(+), 63 deletions(-) diff --git a/pkgs/applications/science/math/mxnet/default.nix b/pkgs/applications/science/math/mxnet/default.nix index c1a329c608864..240a1759397fe 100644 --- a/pkgs/applications/science/math/mxnet/default.nix +++ b/pkgs/applications/science/math/mxnet/default.nix @@ -50,7 +50,7 @@ stdenv.mkDerivation rec { "-DUSE_OLDCMAKECUDA=ON" # see https://github.com/apache/incubator-mxnet/issues/10743 "-DCUDA_ARCH_NAME=All" "-DCUDA_HOST_COMPILER=${cudatoolkit.cc}/bin/cc" - "-DMXNET_CUDA_ARCH=${builtins.concatStringsSep ";" cudaFlags.cudaRealArches}" + "-DMXNET_CUDA_ARCH=${builtins.concatStringsSep ";" cudaFlags.realArches}" ] else [ "-DUSE_CUDA=OFF" ]) ++ lib.optional (!cudnnSupport) "-DUSE_CUDNN=OFF"; diff --git a/pkgs/development/compilers/cudatoolkit/flags.nix b/pkgs/development/compilers/cudatoolkit/flags.nix index 9d7b7f884ad2f..a43485a7dcfd4 100644 --- a/pkgs/development/compilers/cudatoolkit/flags.nix +++ b/pkgs/development/compilers/cudatoolkit/flags.nix @@ -18,8 +18,15 @@ let # from improved performance, reduced file size, or greater hardware suppport by # passing a configuration based on your specific GPU environment. # - # config.cudaCapabilities: list of hardware generations to support (e.g., "8.0") - # config.cudaForwardCompat: bool for compatibility with future GPU generations + # config.cudaCapabilities :: List Capability + # List of hardware generations to build + # Last item is considered the optional forward-compatibility arch + # E.g. [ "8.0" ] + # + # config.cudaForwardCompat :: Bool + # Whether to include the forward compatibility gencode (+PTX) + # to support future GPU generations: + # E.g. 
true # # Please see the accompanying documentation or https://github.com/NixOS/nixpkgs/pull/205351 @@ -39,6 +46,9 @@ let # GPUs which are supported by the provided CUDA version. supportedGpus = builtins.filter isSupported gpus; + # supportedCapabilities :: List Capability + supportedCapabilities = lists.map (gpu: gpu.computeCapability) supportedGpus; + # cudaArchNameToVersions :: AttrSet String (List String) # Maps the name of a GPU architecture to different versions of that architecture. # For example, "Ampere" maps to [ "8.0" "8.6" "8.7" ]. @@ -49,12 +59,6 @@ let (gpu: gpu.archName) supportedGpus; - # cudaArchNames :: List String - # NOTE: It's important that we don't rely on builtins.attrNames cudaArchNameToVersions here; - # otherwise, we'll get the names sorted in alphabetical order. The JSON array we read them - # from is already sorted, so we'll preserve that order here. - cudaArchNames = lists.unique (lists.map (gpu: gpu.archName) supportedGpus); - # cudaComputeCapabilityToName :: AttrSet String String # Maps the version of a GPU architecture to the name of that architecture. # For example, "8.0" maps to "Ampere". @@ -67,23 +71,6 @@ let supportedGpus ); - # cudaComputeCapabilities :: List String - # NOTE: It's important that we don't rely on builtins.attrNames cudaComputeCapabilityToName here; - # otherwise, we'll get the versions sorted in alphabetical order. The JSON array we read them - # from is already sorted, so we'll preserve that order here. - # Use the user-provided list of CUDA capabilities if it's provided. - cudaComputeCapabilities = config.cudaCapabilities - or (lists.map (gpu: gpu.computeCapability) supportedGpus); - - # cudaForwardComputeCapability :: String - cudaForwardComputeCapability = (lists.last cudaComputeCapabilities) + "+PTX"; - - # cudaComputeCapabilitiesAndForward :: List String - # The list of supported CUDA architectures, including the forward compatibility architecture. - # If forward compatibility is disabled, this will be the same as cudaComputeCapabilities. - cudaComputeCapabilitiesAndForward = cudaComputeCapabilities - ++ lists.optional (config.cudaForwardCompat or true) cudaForwardComputeCapability; - # dropDot :: String -> String dropDot = ver: builtins.replaceStrings [ "." ] [ "" ] ver; @@ -101,38 +88,79 @@ let "-gencode=arch=compute_${dropDot computeCapability},code=${feat}_${dropDot computeCapability}" ); - # cudaRealArches :: List String - # The real architectures are physical architectures supported by the CUDA version. - # For example, "sm_80". - cudaRealArches = archMapper "sm" cudaComputeCapabilities; - - # cudaVirtualArches :: List String - # The virtual architectures are typically used for forward compatibility, when trying to support - # an architecture newer than the CUDA version allows. - # For example, "compute_80". - cudaVirtualArches = archMapper "compute" cudaComputeCapabilities; - - # cudaArches :: List String - # By default, build for all supported architectures and forward compatibility via a virtual - # architecture for the newest supported architecture. - cudaArches = cudaRealArches ++ - lists.optional (config.cudaForwardCompat or true) (lists.last cudaVirtualArches); - - # cudaGencode :: List String - # A list of CUDA gencode arguments to pass to NVCC. 
- cudaGencode = - let - base = gencodeMapper "sm" cudaComputeCapabilities; - forwardCompat = gencodeMapper "compute" [ (lists.last cudaComputeCapabilities) ]; - in - base ++ lists.optionals (config.cudaForwardCompat or true) forwardCompat; + formatCapabilities = { cudaCapabilities, enableForwardCompat ? true }: rec { + inherit cudaCapabilities enableForwardCompat; + + # forwardCapability :: String + # Forward "compute" capability, a.k.a PTX + # E.g. "8.6+PTX" + forwardCapability = (lists.last cudaCapabilities) + "+PTX"; + + # capabilitiesAndForward :: List String + # The list of supported CUDA architectures, including the forward compatibility architecture. + # If forward compatibility is disabled, this will be the same as cudaCapabilities. + # E.g. [ "7.5" "8.6" "8.6+PTX" ] + capabilitiesAndForward = cudaCapabilities ++ lists.optionals enableForwardCompat [ forwardCapability ]; + + # archNames :: List String + # E.g. [ "Turing" "Ampere" ] + archNames = lists.unique (builtins.map (cap: cudaComputeCapabilityToName.${cap}) cudaCapabilities); + + # realArches :: List String + # The real architectures are physical architectures supported by the CUDA version. + # E.g. [ "sm_75" "sm_86" ] + realArches = archMapper "sm" cudaCapabilities; + + # virtualArches :: List String + # The virtual architectures are typically used for forward compatibility, when trying to support + # an architecture newer than the CUDA version allows. + # E.g. [ "compute_75" "compute_86" ] + virtualArches = archMapper "compute" cudaCapabilities; + + # arches :: List String + # By default, build for all supported architectures and forward compatibility via a virtual + # architecture for the newest supported architecture. + # E.g. [ "sm_75" "sm_86" "compute_86" ] + arches = realArches ++ + lists.optional enableForwardCompat (lists.last virtualArches); + + # gencode :: List String + # A list of CUDA gencode arguments to pass to NVCC. + # E.g. [ "-gencode=arch=compute_75,code=sm_75" ... "-gencode=arch=compute_86,code=compute_86" ] + gencode = + let + base = gencodeMapper "sm" cudaCapabilities; + forward = gencodeMapper "compute" [ (lists.last cudaCapabilities) ]; + in + base ++ lib.optionals enableForwardCompat forward; + }; in +# When changing names or formats: pause, validate, and update the assert +assert (formatCapabilities { cudaCapabilities = [ "7.5" "8.6" ]; }) == { + cudaCapabilities = [ "7.5" "8.6" ]; + enableForwardCompat = true; + + capabilitiesAndForward = [ "7.5" "8.6" "8.6+PTX" ]; + forwardCapability = "8.6+PTX"; + + archNames = [ "Turing" "Ampere" ]; + realArches = [ "sm_75" "sm_86" ]; + virtualArches = [ "compute_75" "compute_86" ]; + arches = [ "sm_75" "sm_86" "compute_86" ]; + + gencode = [ "-gencode=arch=compute_75,code=sm_75" "-gencode=arch=compute_86,code=sm_86" "-gencode=arch=compute_86,code=compute_86" ]; +}; { - inherit - cudaArchNames - cudaArchNameToVersions cudaComputeCapabilityToName - cudaRealArches cudaVirtualArches cudaArches - cudaGencode; - cudaCapabilities = cudaComputeCapabilitiesAndForward; + # formatCapabilities :: { cudaCapabilities: List Capability, cudaForwardCompat: Boolean } -> { ... 
} + inherit formatCapabilities; + + # cudaArchNameToVersions :: String => String + inherit cudaArchNameToVersions; + + # cudaComputeCapabilityToName :: String => String + inherit cudaComputeCapabilityToName; +} // formatCapabilities { + cudaCapabilities = config.cudaCapabilities or supportedCapabilities; + enableForwardCompat = config.cudaForwardCompat or true; } diff --git a/pkgs/development/libraries/opencv/4.x.nix b/pkgs/development/libraries/opencv/4.x.nix index ac021c2b61082..a9f7b0304e84d 100644 --- a/pkgs/development/libraries/opencv/4.x.nix +++ b/pkgs/development/libraries/opencv/4.x.nix @@ -37,7 +37,7 @@ , enableContrib ? true , enableCuda ? (config.cudaSupport or false) && stdenv.hostPlatform.isx86_64 -, cudatoolkit +, cudaPackages ? { } , nvidia-optical-flow-sdk , enableUnfree ? false @@ -79,6 +79,9 @@ }: let + inherit (cudaPackages) cudatoolkit; + inherit (cudaPackages.cudaFlags) cudaCapabilities; + version = "4.7.0"; src = fetchFromGitHub { @@ -342,6 +345,14 @@ stdenv.mkDerivation { "-DCUDA_FAST_MATH=ON" "-DCUDA_HOST_COMPILER=${cudatoolkit.cc}/bin/cc" "-DCUDA_NVCC_FLAGS=--expt-relaxed-constexpr" + + # OpenCV respects at least three variables: + # -DCUDA_GENERATION takes a single arch name, e.g. Volta + # -DCUDA_ARCH_BIN takes a semi-colon separated list of real arches, e.g. "8.0;8.6" + # -DCUDA_ARCH_PTX takes the virtual arch, e.g. "8.6" + "-DCUDA_ARCH_BIN=${lib.concatStringsSep ";" cudaCapabilities}" + "-DCUDA_ARCH_PTX=${lib.last cudaCapabilities}" + "-DNVIDIA_OPTICAL_FLOW_2_0_HEADERS_PATH=${nvidia-optical-flow-sdk}" ] ++ lib.optionals stdenv.isDarwin [ "-DWITH_OPENCL=OFF" diff --git a/pkgs/development/libraries/science/math/magma/generic.nix b/pkgs/development/libraries/science/math/magma/generic.nix index ab0a2125ec00e..e27107ca15d80 100644 --- a/pkgs/development/libraries/science/math/magma/generic.nix +++ b/pkgs/development/libraries/science/math/magma/generic.nix @@ -37,13 +37,13 @@ let # lists.subtractLists a b = b - a # For CUDA - supportedCudaSmArches = lists.intersectLists cudaFlags.cudaRealArches supportedGpuTargets; + supportedCudaSmArches = lists.intersectLists cudaFlags.realArches supportedGpuTargets; # Subtract the supported SM architectures from the real SM architectures to get the unsupported # SM architectures. - unsupportedCudaSmArches = lists.subtractLists supportedCudaSmArches cudaFlags.cudaRealArches; + unsupportedCudaSmArches = lists.subtractLists supportedCudaSmArches cudaFlags.realArches; # For ROCm - # NOTE: The hip.gpuTargets are prefixed with "gfx" instead of "sm" like cudaFlags.cudaRealArches. + # NOTE: The hip.gpuTargets are prefixed with "gfx" instead of "sm" like cudaFlags.realArches. # For some reason, Magma's CMakeLists.txt file does not handle the "gfx" prefix, so we must # remove it. 
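   # For illustration only (hypothetical input, not taken from this PR):
   #   [ "gfx906" "gfx908" ] -> [ "906" "908" ]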
rocmArches = lists.map (x: strings.removePrefix "gfx" x) hip.gpuTargets; diff --git a/pkgs/development/libraries/science/math/nccl/default.nix b/pkgs/development/libraries/science/math/nccl/default.nix index 4f82de8e8b6fa..155e863bf21e4 100644 --- a/pkgs/development/libraries/science/math/nccl/default.nix +++ b/pkgs/development/libraries/science/math/nccl/default.nix @@ -10,7 +10,7 @@ with cudaPackages; let # Output looks like "-gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_86,code=compute_86" - gencode = lib.concatStringsSep " " cudaFlags.cudaGencode; + gencode = lib.concatStringsSep " " cudaFlags.gencode; in backendStdenv.mkDerivation rec { name = "nccl-${version}-cuda-${cudaPackages.cudaMajorVersion}"; diff --git a/pkgs/development/python-modules/jaxlib/default.nix b/pkgs/development/python-modules/jaxlib/default.nix index 2c13defe43838..ad48af827ee56 100644 --- a/pkgs/development/python-modules/jaxlib/default.nix +++ b/pkgs/development/python-modules/jaxlib/default.nix @@ -164,7 +164,7 @@ let build --action_env TF_CUDA_PATHS="${cudatoolkit_joined},${cudnn},${nccl}" build --action_env TF_CUDA_VERSION="${lib.versions.majorMinor cudatoolkit.version}" build --action_env TF_CUDNN_VERSION="${lib.versions.major cudnn.version}" - build:cuda --action_env TF_CUDA_COMPUTE_CAPABILITIES="${builtins.concatStringsSep "," cudaFlags.cudaRealArches}" + build:cuda --action_env TF_CUDA_COMPUTE_CAPABILITIES="${builtins.concatStringsSep "," cudaFlags.realArches}" '' + '' CFG ''; From 5f4bdbe6c387bf740025581d94bbfba9a887c76f Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Mon, 27 Feb 2023 16:28:07 +0200 Subject: [PATCH 06/13] python3Packages.tensorflow: fix `GLIBCXX_3.4.30' not found Make tensorflow (and a bunch of ther things) use CUDA-compatible toolchain. Introduces cudaPackages.backendStdenv --- .../compilers/cudatoolkit/common.nix | 54 +++++++------------ .../compilers/cudatoolkit/extension.nix | 19 +++++-- .../redist/build-cuda-redist-package.nix | 11 ++-- .../cudatoolkit/redist/overrides.nix | 3 +- .../libraries/science/math/cudnn/generic.nix | 8 +-- .../science/math/tensorrt/generic.nix | 8 +-- .../python-modules/tensorflow/default.nix | 46 +++++++++++----- .../cuda/cuda-library-samples/generic.nix | 8 +-- 8 files changed, 88 insertions(+), 69 deletions(-) diff --git a/pkgs/development/compilers/cudatoolkit/common.nix b/pkgs/development/compilers/cudatoolkit/common.nix index a94f6fbdaf736..e6d7cbc377cf1 100644 --- a/pkgs/development/compilers/cudatoolkit/common.nix +++ b/pkgs/development/compilers/cudatoolkit/common.nix @@ -11,7 +11,7 @@ args@ , fetchurl , fontconfig , freetype -, gcc +, gcc # :: String , gdk-pixbuf , glib , glibc @@ -22,13 +22,13 @@ args@ , perl , python3 , requireFile -, stdenv +, backendStdenv # E.g. gcc11Stdenv, set in extension.nix , unixODBC , xorg , zlib }: -stdenv.mkDerivation rec { +backendStdenv.mkDerivation rec { pname = "cudatoolkit"; inherit version runPatches; @@ -146,37 +146,24 @@ stdenv.mkDerivation rec { # Fix builds with newer glibc version sed -i "1 i#define _BITS_FLOATN_H" "$out/include/host_defines.h" - - # Ensure that cmake can find CUDA. + '' + + # Point NVCC at a compatible compiler + # FIXME: redist cuda_nvcc copy-pastes this code + # Refer to comments in the overrides for cuda_nvcc for explanation + # CUDA_TOOLKIT_ROOT_DIR is legacy, + # Cf. 
https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables + '' mkdir -p $out/nix-support - echo "cmakeFlags+=' -DCUDA_TOOLKIT_ROOT_DIR=$out'" >> $out/nix-support/setup-hook - - # Set the host compiler to be used by nvcc. - # FIXME: redist cuda_nvcc copy-pastes this code - - # For CMake-based projects: - # https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables - # https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html - # https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html - - # For non-CMake projects: - # FIXME: results in "incompatible redefinition" warnings ...but we keep - # both this and cmake variables until we come up with a more general - # solution - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin - cat <> $out/nix-support/setup-hook - - cmakeFlags+=' -DCUDA_HOST_COMPILER=${gcc}/bin' - cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${gcc}/bin' + cmakeFlags+=' -DCUDA_TOOLKIT_ROOT_DIR=$out' + cmakeFlags+=' -DCUDA_HOST_COMPILER=${backendStdenv.cc}/bin' + cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${backendStdenv.cc}/bin' if [ -z "\''${CUDAHOSTCXX-}" ]; then - export CUDAHOSTCXX=${gcc}/bin; + export CUDAHOSTCXX=${backendStdenv.cc}/bin; fi - - export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${gcc}/bin' + export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${backendStdenv.cc}/bin' EOF - # Move some libraries to the lib output so that programs that # depend on them don't pull in this entire monstrosity. mkdir -p $lib/lib @@ -212,11 +199,10 @@ stdenv.mkDerivation rec { # The path to libstdc++ and such # - # NB: - # 1. "gcc" (gcc-wrapper) here is what's exposed as cudaPackages.cudatoolkit.cc - # 2. "gcc.cc" is the unwrapped gcc - # 3. "gcc.cc.lib" is one of its outputs - "${gcc.cc.lib}/lib64" + # `backendStdenv` is the cuda-compatible toolchain that we pick in + # extension.nix; we hand it to NVCC to use as a back-end, and we link + # cudatoolkit's binaries against its libstdc++ + "${backendStdenv.cc.cc.lib}/lib64" "$out/jre/lib/amd64/jli" "$out/lib64" @@ -286,7 +272,7 @@ stdenv.mkDerivation rec { popd ''; passthru = { - cc = gcc; + cc = backendStdenv.cc; majorMinorVersion = lib.versions.majorMinor version; majorVersion = lib.versions.majorMinor version; }; diff --git a/pkgs/development/compilers/cudatoolkit/extension.nix b/pkgs/development/compilers/cudatoolkit/extension.nix index c11f12b118a2f..72cab97f8ffc5 100644 --- a/pkgs/development/compilers/cudatoolkit/extension.nix +++ b/pkgs/development/compilers/cudatoolkit/extension.nix @@ -7,11 +7,24 @@ final: prev: let # Version info for the classic cudatoolkit packages that contain everything that is in redist. cudatoolkitVersions = final.lib.importTOML ./versions.toml; + finalVersion = cudatoolkitVersions.${final.cudaVersion}; + + # Exposed as cudaPackages.backendStdenv. + # We don't call it just "stdenv" to avoid confusion: e.g. this toolchain doesn't contain nvcc. + # Instead, it's the back-end toolchain for nvcc to use. + # We also use this to link a compatible libstdc++ (backendStdenv.cc.cc.lib) + # Cf. 
https://github.com/NixOS/nixpkgs/pull/218265 for context + backendStdenv = prev.pkgs."${finalVersion.gcc}Stdenv"; + ### Add classic cudatoolkit package - cudatoolkit = buildCudaToolkitPackage ((attrs: attrs // { gcc = prev.pkgs.${attrs.gcc}; }) cudatoolkitVersions.${final.cudaVersion}); + cudatoolkit = buildCudaToolkitPackage (finalVersion // { inherit backendStdenv; }); cudaFlags = final.callPackage ./flags.nix {}; -in { - inherit cudatoolkit cudaFlags; +in +{ + inherit + backendStdenv + cudatoolkit + cudaFlags; } diff --git a/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix b/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix index 3bf9184eefabb..1b216ee625a89 100644 --- a/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix +++ b/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix @@ -1,5 +1,5 @@ { lib -, stdenv +, backendStdenv , fetchurl , autoPatchelfHook , autoAddOpenGLRunpathHook @@ -11,7 +11,7 @@ attrs: let arch = "linux-x86_64"; in -stdenv.mkDerivation { +backendStdenv.mkDerivation { inherit pname; inherit (attrs) version; @@ -33,11 +33,8 @@ stdenv.mkDerivation { # autoPatchelfHook will search for a libstdc++ and we're giving it a # "compatible" libstdc++ from the same toolchain that NVCC uses. # - # E.g. it might happen that stdenv=gcc12Stdenv, but we build against cuda11 - # that only "supports" gcc11. Linking against gcc12's libraries we might - # sometimes actually sometimes encounter dynamic linkage errors at runtime # NB: We don't actually know if this is the right thing to do - cudatoolkit.cc.cc.lib + backendStdenv.cc.cc.lib ]; dontBuild = true; @@ -51,7 +48,7 @@ stdenv.mkDerivation { runHook postInstall ''; - passthru.stdenv = stdenv; + passthru.stdenv = backendStdenv; meta = { description = attrs.name; diff --git a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix index 663af1db7632b..96b782d8c990d 100644 --- a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix +++ b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix @@ -24,7 +24,7 @@ in cuda_nvcc = prev.cuda_nvcc.overrideAttrs (oldAttrs: let - inherit (prev.cudatoolkit) cc; + inherit (prev.backendStdenv) cc; in { # Point NVCC at a compatible compiler @@ -44,7 +44,6 @@ in postInstall = (oldAttrs.postInstall or "") + '' mkdir -p $out/nix-support cat <> $out/nix-support/setup-hook - cmakeFlags+=' -DCUDA_TOOLKIT_ROOT_DIR=$out' cmakeFlags+=' -DCUDA_HOST_COMPILER=${cc}/bin' cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${cc}/bin' if [ -z "\''${CUDAHOSTCXX-}" ]; then diff --git a/pkgs/development/libraries/science/math/cudnn/generic.nix b/pkgs/development/libraries/science/math/cudnn/generic.nix index d4e1f641a956e..b2844ae6b074c 100644 --- a/pkgs/development/libraries/science/math/cudnn/generic.nix +++ b/pkgs/development/libraries/science/math/cudnn/generic.nix @@ -1,11 +1,11 @@ { - stdenv, + backendStdenv, lib, zlib, useCudatoolkitRunfile ? false, cudaVersion, cudaMajorVersion, - cudatoolkit, # if cuda>=11: only used for .cc + cudatoolkit, # For cuda < 11 libcublas ? 
null, # cuda <11 doesn't ship redist packages autoPatchelfHook, autoAddOpenGLRunpathHook, @@ -26,7 +26,7 @@ maxCudaVersion, }: assert useCudatoolkitRunfile || (libcublas != null); let - inherit (cudatoolkit) cc; + inherit (backendStdenv) cc; inherit (lib) lists strings trivial versions; # majorMinorPatch :: String -> String @@ -46,7 +46,7 @@ assert useCudatoolkitRunfile || (libcublas != null); let then cudatoolkit else libcublas; in - stdenv.mkDerivation { + backendStdenv.mkDerivation { pname = "cudatoolkit-${cudaMajorVersion}-cudnn"; version = versionTriple; diff --git a/pkgs/development/libraries/science/math/tensorrt/generic.nix b/pkgs/development/libraries/science/math/tensorrt/generic.nix index 3447087051f1e..31090f715c222 100644 --- a/pkgs/development/libraries/science/math/tensorrt/generic.nix +++ b/pkgs/development/libraries/science/math/tensorrt/generic.nix @@ -1,5 +1,5 @@ { lib -, stdenv +, backendStdenv , requireFile , autoPatchelfHook , autoAddOpenGLRunpathHook @@ -18,7 +18,7 @@ assert lib.assertMsg (lib.strings.versionAtLeast cudnn.version fileVersionCudnn) "This version of TensorRT requires at least cuDNN ${fileVersionCudnn} (current version is ${cudnn.version})"; -stdenv.mkDerivation rec { +backendStdenv.mkDerivation rec { pname = "cudatoolkit-${cudatoolkit.majorVersion}-tensorrt"; version = fullVersion; src = requireFile rec { @@ -45,7 +45,7 @@ stdenv.mkDerivation rec { # Used by autoPatchelfHook buildInputs = [ - cudatoolkit.cc.cc.lib # libstdc++ + backendStdenv.cc.cc.lib # libstdc++ cudatoolkit cudnn ]; @@ -74,6 +74,8 @@ stdenv.mkDerivation rec { "$out/lib/libnvinfer_builder_resource.so.${mostOfVersion}" ''; + passthru.stdenv = backendStdenv; + meta = with lib; { # Check that the cudatoolkit version satisfies our min/max constraints (both # inclusive). We mark the package as broken if it fails to satisfies the diff --git a/pkgs/development/python-modules/tensorflow/default.nix b/pkgs/development/python-modules/tensorflow/default.nix index f18a924c31fa2..adc7b1c1e0b3f 100644 --- a/pkgs/development/python-modules/tensorflow/default.nix +++ b/pkgs/development/python-modules/tensorflow/default.nix @@ -32,6 +32,26 @@ }: let + originalStdenv = stdenv; +in +let + # Tensorflow looks at many toolchain-related variables which may diverge. + # + # Toolchain for cuda-enabled builds. + # We want to achieve two things: + # 1. NVCC should use a compatible back-end (e.g. gcc11 for cuda11) + # 2. Normal C++ files should be compiled with the same toolchain, + # to avoid potential weird dynamic linkage errors at runtime. + # This may not be necessary though + # + # Toolchain for Darwin: + # clang 7 fails to emit a symbol for + # __ZN4llvm11SmallPtrSetIPKNS_10AllocaInstELj8EED1Ev in any of the + # translation units, so the build fails at link time + stdenv = + if cudaSupport then cudaPackages.backendStdenv + else if originalStdenv.isDarwin then llvmPackages_11.stdenv + else originalStdenv; inherit (cudaPackages) cudatoolkit cudnn nccl; in @@ -44,6 +64,7 @@ assert ! (stdenv.isDarwin && cudaSupport); let withTensorboard = (pythonOlder "3.6") || tensorboardSupport; + # FIXME: migrate to redist cudaPackages cudatoolkit_joined = symlinkJoin { name = "${cudatoolkit.name}-merged"; paths = [ @@ -56,10 +77,13 @@ let ]; }; + # Tensorflow expects bintools at hard-coded paths, e.g. 
/usr/bin/ar + # The only way to overcome that is to set GCC_HOST_COMPILER_PREFIX, + # but that path must contain cc as well, so we merge them cudatoolkit_cc_joined = symlinkJoin { - name = "${cudatoolkit.cc.name}-merged"; + name = "${stdenv.cc.name}-merged"; paths = [ - cudatoolkit.cc + stdenv.cc binutils.bintools # for ar, dwp, nm, objcopy, objdump, strip ]; }; @@ -175,12 +199,7 @@ let ''; }) else _bazel-build; - _bazel-build = (buildBazelPackage.override (lib.optionalAttrs stdenv.isDarwin { - # clang 7 fails to emit a symbol for - # __ZN4llvm11SmallPtrSetIPKNS_10AllocaInstELj8EED1Ev in any of the - # translation units, so the build fails at link time - stdenv = llvmPackages_11.stdenv; - })) { + _bazel-build = buildBazelPackage.override { inherit stdenv; } { name = "${pname}-${version}"; bazel = bazel_5; @@ -211,12 +230,13 @@ let flatbuffers-core giflib grpc - icu + # Necessary to fix the "`GLIBCXX_3.4.30' not found" error + (icu.override { inherit stdenv; }) jsoncpp libjpeg_turbo libpng lmdb-core - pybind11 + (pybind11.overridePythonAttrs (_: { inherit stdenv; })) snappy sqlite ] ++ lib.optionals cudaSupport [ @@ -301,10 +321,12 @@ let TF_NEED_CUDA = tfFeature cudaSupport; TF_CUDA_PATHS = lib.optionalString cudaSupport "${cudatoolkit_joined},${cudnn},${nccl}"; - GCC_HOST_COMPILER_PREFIX = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin"; - GCC_HOST_COMPILER_PATH = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin/gcc"; TF_CUDA_COMPUTE_CAPABILITIES = lib.concatStringsSep "," cudaCapabilities; + # Needed even when we override stdenv: e.g. for ar + GCC_HOST_COMPILER_PREFIX = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin"; + GCC_HOST_COMPILER_PATH = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin/cc"; + postPatch = '' # bazel 3.3 should work just as well as bazel 3.1 rm -f .bazelversion diff --git a/pkgs/test/cuda/cuda-library-samples/generic.nix b/pkgs/test/cuda/cuda-library-samples/generic.nix index e01664bab3191..e9a481c94a7a4 100644 --- a/pkgs/test/cuda/cuda-library-samples/generic.nix +++ b/pkgs/test/cuda/cuda-library-samples/generic.nix @@ -1,4 +1,4 @@ -{ lib, stdenv, fetchFromGitHub +{ lib, backendStdenv, fetchFromGitHub , cmake, addOpenGLRunpath , cudatoolkit , cutensor @@ -35,13 +35,13 @@ let in { - cublas = stdenv.mkDerivation (commonAttrs // { + cublas = backendStdenv.mkDerivation (commonAttrs // { pname = "cuda-library-samples-cublas"; src = "${src}/cuBLASLt"; }); - cusolver = stdenv.mkDerivation (commonAttrs // { + cusolver = backendStdenv.mkDerivation (commonAttrs // { pname = "cuda-library-samples-cusolver"; src = "${src}/cuSOLVER"; @@ -49,7 +49,7 @@ in sourceRoot = "cuSOLVER/gesv"; }); - cutensor = stdenv.mkDerivation (commonAttrs // { + cutensor = backendStdenv.mkDerivation (commonAttrs // { pname = "cuda-library-samples-cutensor"; src = "${src}/cuTENSOR"; From 17248123b6ae01b89a25de730ea890276acd69b2 Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Tue, 28 Feb 2023 18:07:45 +0200 Subject: [PATCH 07/13] cudaPackages_12: use gcc12 --- pkgs/development/compilers/cudatoolkit/versions.toml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pkgs/development/compilers/cudatoolkit/versions.toml b/pkgs/development/compilers/cudatoolkit/versions.toml index 7e9fcae3271ac..a201a4a263f5e 100644 --- a/pkgs/development/compilers/cudatoolkit/versions.toml +++ b/pkgs/development/compilers/cudatoolkit/versions.toml @@ -76,8 +76,4 @@ gcc = "gcc11" version = "12.0.1" url = 
"https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installers/cuda_12.0.1_525.85.12_linux.run" sha256 = "sha256-GyBaBicvFGP0dydv2rkD8/ZmkXwGjlIHOAAeacehh1s=" -# CUDA 12 is compatible with gcc12, but nixpkgs default gcc is still on gcc11 as -# of 2023-01-08. See https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements. -# This should be upgraded to gcc12 once nixpkgs default gcc is upgraded. Other -# CUDA versions should likely have their gcc versions upgraded as well. -gcc = "gcc11" +gcc = "gcc12" From 2b69d618c28bdcbc822843a534c2cb74542ec972 Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Wed, 1 Mar 2023 23:52:07 +0200 Subject: [PATCH 08/13] opencv3: respect config.cudaCapabilities --- pkgs/development/libraries/opencv/3.x.nix | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pkgs/development/libraries/opencv/3.x.nix b/pkgs/development/libraries/opencv/3.x.nix index e1a13c9fe573f..75d8c712df1f6 100644 --- a/pkgs/development/libraries/opencv/3.x.nix +++ b/pkgs/development/libraries/opencv/3.x.nix @@ -15,8 +15,8 @@ , enableContrib ? true , enableCuda ? (config.cudaSupport or false) && - stdenv.hostPlatform.isx86_64, cudatoolkit - + stdenv.hostPlatform.isx86_64 +, cudaPackages ? { } , enableUnfree ? false , enableIpp ? false , enablePython ? false, pythonPackages ? null @@ -40,6 +40,9 @@ assert blas.implementation == "openblas" && lapack.implementation == "openblas"; assert enablePython -> pythonPackages != null; let + inherit (cudaPackages) cudatoolkit; + inherit (cudaPackages.cudaFlags) cudaCapabilities; + version = "3.4.18"; src = fetchFromGitHub { @@ -242,6 +245,8 @@ stdenv.mkDerivation { "-DCUDA_FAST_MATH=ON" "-DCUDA_HOST_COMPILER=${cudatoolkit.cc}/bin/cc" "-DCUDA_NVCC_FLAGS=--expt-relaxed-constexpr" + "-DCUDA_ARCH_BIN=${lib.concatStringsSep ";" cudaCapabilities}" + "-DCUDA_ARCH_PTX=${lib.last cudaCapabilities}" ] ++ lib.optionals stdenv.isDarwin [ "-DWITH_OPENCL=OFF" "-DWITH_LAPACK=OFF" From c376c54f70b91c68f6f2ddc90838b57a82b12ecd Mon Sep 17 00:00:00 2001 From: Someone Date: Thu, 2 Mar 2023 17:47:47 +0000 Subject: [PATCH 09/13] cudaPackages.cudatoolkit: refactor inheriting passthru.cc Co-authored-by: Connor Baker --- pkgs/development/compilers/cudatoolkit/common.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkgs/development/compilers/cudatoolkit/common.nix b/pkgs/development/compilers/cudatoolkit/common.nix index e6d7cbc377cf1..1195f7be7de63 100644 --- a/pkgs/development/compilers/cudatoolkit/common.nix +++ b/pkgs/development/compilers/cudatoolkit/common.nix @@ -272,7 +272,7 @@ backendStdenv.mkDerivation rec { popd ''; passthru = { - cc = backendStdenv.cc; + inherit (backendStdenv) cc; majorMinorVersion = lib.versions.majorMinor version; majorVersion = lib.versions.majorMinor version; }; From 8bf5f5ac893ff07406a3a1979d944c2a86cfc887 Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Fri, 3 Mar 2023 02:19:50 +0200 Subject: [PATCH 10/13] magma: use CMAKE_CUDA_ARCHITECTURES directly --- .../compilers/cudatoolkit/flags.nix | 3 ++ .../libraries/science/math/magma/generic.nix | 42 ++++++++++++------- .../libraries/science/math/magma/releases.nix | 31 +------------- 3 files changed, 30 insertions(+), 46 deletions(-) diff --git a/pkgs/development/compilers/cudatoolkit/flags.nix b/pkgs/development/compilers/cudatoolkit/flags.nix index a43485a7dcfd4..b65219369404f 100644 --- a/pkgs/development/compilers/cudatoolkit/flags.nix +++ b/pkgs/development/compilers/cudatoolkit/flags.nix @@ -160,6 +160,9 
@@ assert (formatCapabilities { cudaCapabilities = [ "7.5" "8.6" ]; }) == { # cudaComputeCapabilityToName :: String => String inherit cudaComputeCapabilityToName; + + # dropDot :: String -> String + inherit dropDot; } // formatCapabilities { cudaCapabilities = config.cudaCapabilities or supportedCapabilities; enableForwardCompat = config.cudaForwardCompat or true; diff --git a/pkgs/development/libraries/science/math/magma/generic.nix b/pkgs/development/libraries/science/math/magma/generic.nix index e27107ca15d80..f61f1877019b2 100644 --- a/pkgs/development/libraries/science/math/magma/generic.nix +++ b/pkgs/development/libraries/science/math/magma/generic.nix @@ -11,7 +11,8 @@ , cudaSupport ? true , fetchurl , gfortran -, gpuTargets ? [ ] +, cudaCapabilities ? cudaPackages.cudaFlags.cudaCapabilities +, gpuTargets ? [ ] # Non-CUDA targets, that is HIP , hip , hipblas , hipsparse @@ -36,12 +37,6 @@ let # of the first list *from* the second list. That means: # lists.subtractLists a b = b - a - # For CUDA - supportedCudaSmArches = lists.intersectLists cudaFlags.realArches supportedGpuTargets; - # Subtract the supported SM architectures from the real SM architectures to get the unsupported - # SM architectures. - unsupportedCudaSmArches = lists.subtractLists supportedCudaSmArches cudaFlags.realArches; - # For ROCm # NOTE: The hip.gpuTargets are prefixed with "gfx" instead of "sm" like cudaFlags.realArches. # For some reason, Magma's CMakeLists.txt file does not handle the "gfx" prefix, so we must @@ -62,19 +57,32 @@ let ) supported; - # Create the gpuTargetString. gpuTargetString = strings.concatStringsSep "," ( if gpuTargets != [ ] then # If gpuTargets is specified, it always takes priority. gpuArchWarner supportedCustomGpuTargets unsupportedCustomGpuTargets - else if cudaSupport then - gpuArchWarner supportedCudaSmArches unsupportedCudaSmArches else if rocmSupport then gpuArchWarner supportedRocmArches unsupportedRocmArches + else if cudaSupport then + [ ] # It's important we pass explicit -DGPU_TARGET to reset magma's defaults else throw "No GPU targets specified" ); + # E.g. [ "80" "86" "90" ] + cudaArchitectures = (builtins.map cudaFlags.dropDot cudaCapabilities); + + cudaArchitecturesString = strings.concatStringsSep ";" cudaArchitectures; + minArch = + let + minArch' = builtins.head (builtins.sort builtins.lessThan cudaArchitectures); + in + # If this fails some day, something must've changed and we should re-validate our assumptions + assert builtins.stringLength minArch' == 2; + # "75" -> "750" Cf. https://bitbucket.org/icl/magma/src/f4ec79e2c13a2347eff8a77a3be6f83bc2daec20/CMakeLists.txt#lines-273 + "${minArch'}0"; + + cuda_joined = symlinkJoin { name = "cuda-redist-${cudaVersion}"; paths = with cudaPackages; [ @@ -87,6 +95,8 @@ let }; in +assert (builtins.match "[^[:space:]]*" gpuTargetString) != null; + stdenv.mkDerivation { pname = "magma"; inherit version; @@ -116,7 +126,11 @@ stdenv.mkDerivation { openmp ]; - cmakeFlags = lists.optionals cudaSupport [ + cmakeFlags = [ + "-DGPU_TARGET=${gpuTargetString}" + ] ++ lists.optionals cudaSupport [ + "-DCMAKE_CUDA_ARCHITECTURES=${cudaArchitecturesString}" + "-DMIN_ARCH=${minArch}" # Disarms magma's asserts "-DCMAKE_C_COMPILER=${cudatoolkit.cc}/bin/cc" "-DCMAKE_CXX_COMPILER=${cudatoolkit.cc}/bin/c++" "-DMAGMA_ENABLE_CUDA=ON" @@ -126,14 +140,10 @@ stdenv.mkDerivation { "-DMAGMA_ENABLE_HIP=ON" ]; - # NOTE: We must set GPU_TARGET in preConfigure in this way because it may contain spaces. 
- preConfigure = '' - cmakeFlagsArray+=("-DGPU_TARGET=${gpuTargetString}") - '' # NOTE: The stdenv's CXX is used when compiling the CMake test to determine the version of # CUDA available. This isn't necessarily the same as cudatoolkit.cc, so we must set # CUDAHOSTCXX. - + strings.optionalString cudaSupport '' + preConfigure = strings.optionalString cudaSupport '' export CUDAHOSTCXX=${cudatoolkit.cc}/bin/c++ ''; diff --git a/pkgs/development/libraries/science/math/magma/releases.nix b/pkgs/development/libraries/science/math/magma/releases.nix index 3d08aa95d4d18..029f418edce3c 100644 --- a/pkgs/development/libraries/science/math/magma/releases.nix +++ b/pkgs/development/libraries/science/math/magma/releases.nix @@ -1,27 +1,13 @@ # NOTE: Order matters! Put the oldest version first, and the newest version last. # NOTE: Make sure the supportedGpuTargets are in order of oldest to newest. # You can update the supportedGpuTargets by looking at the CMakeLists.txt file. -# CUDA starts here: https://bitbucket.org/icl/magma/src/f4ec79e2c13a2347eff8a77a3be6f83bc2daec20/CMakeLists.txt#lines-175 # HIP is here: https://bitbucket.org/icl/magma/src/f4ec79e2c13a2347eff8a77a3be6f83bc2daec20/CMakeLists.txt#lines-386 +# CUDA works around magma's wrappers and uses FindCUDAToolkit directly [ { version = "2.6.2"; hash = "sha256-dbVU2rAJA+LRC5cskT5Q5/iMvGLzrkMrWghsfk7aCnE="; supportedGpuTargets = [ - "sm_20" - "sm_30" - "sm_35" - "sm_37" - "sm_50" - "sm_52" - "sm_53" - "sm_60" - "sm_61" - "sm_62" - "sm_70" - "sm_71" - "sm_75" - "sm_80" "700" "701" "702" @@ -53,21 +39,6 @@ version = "2.7.1"; hash = "sha256-2chxHAR6OMrhbv3nS+4uszMyF/0nEeHpuGBsu7SuGlA="; supportedGpuTargets = [ - "sm_20" - "sm_30" - "sm_35" - "sm_37" - "sm_50" - "sm_52" - "sm_53" - "sm_60" - "sm_61" - "sm_62" - "sm_70" - "sm_71" - "sm_75" - "sm_80" - "sm_90" "700" "701" "702" From dd2b27692e8a32316d263b938bddfa515eb2775a Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Fri, 3 Mar 2023 03:23:40 +0200 Subject: [PATCH 11/13] magma: explain `cudaSupport ? true` --- pkgs/development/libraries/science/math/magma/generic.nix | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkgs/development/libraries/science/math/magma/generic.nix b/pkgs/development/libraries/science/math/magma/generic.nix index f61f1877019b2..c997fcc090133 100644 --- a/pkgs/development/libraries/science/math/magma/generic.nix +++ b/pkgs/development/libraries/science/math/magma/generic.nix @@ -8,6 +8,11 @@ { blas , cmake , cudaPackages + # FIXME: cuda being unfree means ofborg won't eval "magma". + # respecting config.cudaSupport -> false by default + # -> ofborg eval -> throws "no GPU targets specified". + # Probably should delete everything but "magma-cuda" and "magma-hip" + # from all-packages.nix , cudaSupport ? 
true , fetchurl , gfortran From 0c25f5aa7ffb4ed9d4015fc273f51d08ff2a279b Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Fri, 3 Mar 2023 03:42:58 +0200 Subject: [PATCH 12/13] cudaPackages.cudatoolkit: remove unused gcc argument --- pkgs/development/compilers/cudatoolkit/common.nix | 1 - pkgs/development/compilers/cudatoolkit/extension.nix | 7 ++++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pkgs/development/compilers/cudatoolkit/common.nix b/pkgs/development/compilers/cudatoolkit/common.nix index 1195f7be7de63..fb3b50b981504 100644 --- a/pkgs/development/compilers/cudatoolkit/common.nix +++ b/pkgs/development/compilers/cudatoolkit/common.nix @@ -11,7 +11,6 @@ args@ , fetchurl , fontconfig , freetype -, gcc # :: String , gdk-pixbuf , glib , glibc diff --git a/pkgs/development/compilers/cudatoolkit/extension.nix b/pkgs/development/compilers/cudatoolkit/extension.nix index 72cab97f8ffc5..dd6f7ff2abe7b 100644 --- a/pkgs/development/compilers/cudatoolkit/extension.nix +++ b/pkgs/development/compilers/cudatoolkit/extension.nix @@ -17,7 +17,12 @@ final: prev: let backendStdenv = prev.pkgs."${finalVersion.gcc}Stdenv"; ### Add classic cudatoolkit package - cudatoolkit = buildCudaToolkitPackage (finalVersion // { inherit backendStdenv; }); + cudatoolkit = + let + attrs = builtins.removeAttrs finalVersion [ "gcc" ]; + attrs' = attrs // { inherit backendStdenv; }; + in + buildCudaToolkitPackage attrs'; cudaFlags = final.callPackage ./flags.nix {}; From ac64f07f9c8b9bcc4a4b6d285146cd50473d6b5d Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Fri, 3 Mar 2023 12:29:11 +0200 Subject: [PATCH 13/13] cudaPackages.cudaFlags: drop unused capabilitiesAndForward --- .../compilers/cudatoolkit/flags.nix | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/pkgs/development/compilers/cudatoolkit/flags.nix b/pkgs/development/compilers/cudatoolkit/flags.nix index b65219369404f..989fdb06c5dfb 100644 --- a/pkgs/development/compilers/cudatoolkit/flags.nix +++ b/pkgs/development/compilers/cudatoolkit/flags.nix @@ -19,13 +19,14 @@ let # passing a configuration based on your specific GPU environment. # # config.cudaCapabilities :: List Capability - # List of hardware generations to build - # Last item is considered the optional forward-compatibility arch + # List of hardware generations to build. # E.g. [ "8.0" ] + # Currently, the last item is considered the optional forward-compatibility arch, + # but this may change in the future. # # config.cudaForwardCompat :: Bool # Whether to include the forward compatibility gencode (+PTX) - # to support future GPU generations: + # to support future GPU generations. # E.g. true # # Please see the accompanying documentation or https://github.com/NixOS/nixpkgs/pull/205351 @@ -91,17 +92,6 @@ let formatCapabilities = { cudaCapabilities, enableForwardCompat ? true }: rec { inherit cudaCapabilities enableForwardCompat; - # forwardCapability :: String - # Forward "compute" capability, a.k.a PTX - # E.g. "8.6+PTX" - forwardCapability = (lists.last cudaCapabilities) + "+PTX"; - - # capabilitiesAndForward :: List String - # The list of supported CUDA architectures, including the forward compatibility architecture. - # If forward compatibility is disabled, this will be the same as cudaCapabilities. - # E.g. [ "7.5" "8.6" "8.6+PTX" ] - capabilitiesAndForward = cudaCapabilities ++ lists.optionals enableForwardCompat [ forwardCapability ]; - # archNames :: List String # E.g. 
[ "Turing" "Ampere" ] archNames = lists.unique (builtins.map (cap: cudaComputeCapabilityToName.${cap}) cudaCapabilities); @@ -141,9 +131,6 @@ assert (formatCapabilities { cudaCapabilities = [ "7.5" "8.6" ]; }) == { cudaCapabilities = [ "7.5" "8.6" ]; enableForwardCompat = true; - capabilitiesAndForward = [ "7.5" "8.6" "8.6+PTX" ]; - forwardCapability = "8.6+PTX"; - archNames = [ "Turing" "Ampere" ]; realArches = [ "sm_75" "sm_86" ]; virtualArches = [ "compute_75" "compute_86" ];