From 3de958cac777254b4e15d2cc3e9d3b61d2dbce46 Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Fri, 15 Apr 2022 06:01:32 +0300 Subject: [PATCH 1/8] python3Packages.pytorch: migrate to cudaPackages Use the redistributable cuda packages, instead of runfile-based cudatoolkit --- pkgs/development/libraries/mpich/default.nix | 5 +++ .../libraries/science/math/magma/default.nix | 5 ++- pkgs/development/libraries/ucx/default.nix | 10 ++++- .../python-modules/pytorch/default.nix | 37 ++++++++++++++----- 4 files changed, 46 insertions(+), 11 deletions(-) diff --git a/pkgs/development/libraries/mpich/default.nix b/pkgs/development/libraries/mpich/default.nix index 60e543c462960..ad480e134cd20 100644 --- a/pkgs/development/libraries/mpich/default.nix +++ b/pkgs/development/libraries/mpich/default.nix @@ -42,6 +42,11 @@ stdenv.mkDerivation rec { sed -i 's:FC="gfortran":FC=${gfortran}/bin/gfortran:' $out/bin/mpifort ''; + passthru = { + cudaSupport = ch4backend.cudaSupport or false; + cudaPackages = ch4backend.cudaPackages or { }; + }; + meta = with lib; { description = "Implementation of the Message Passing Interface (MPI) standard"; diff --git a/pkgs/development/libraries/science/math/magma/default.nix b/pkgs/development/libraries/science/math/magma/default.nix index 05d7d4fa18424..73a81cb19445e 100644 --- a/pkgs/development/libraries/science/math/magma/default.nix +++ b/pkgs/development/libraries/science/math/magma/default.nix @@ -72,5 +72,8 @@ in stdenv.mkDerivation { maintainers = with maintainers; [ tbenst ]; }; - passthru.cudatoolkit = cudatoolkit; + passthru = { + # TODO: leave just cudaPackages + inherit cudatoolkit cudaPackages; + }; } diff --git a/pkgs/development/libraries/ucx/default.nix b/pkgs/development/libraries/ucx/default.nix index 0dc79d47dda7b..96dad0e4b3570 100644 --- a/pkgs/development/libraries/ucx/default.nix +++ b/pkgs/development/libraries/ucx/default.nix @@ -1,10 +1,13 @@ { lib, stdenv, fetchFromGitHub, autoreconfHook, doxygen , numactl, rdma-core, 
libbfd, libiberty, perl, zlib, symlinkJoin , enableCuda ? false -, cudatoolkit +, cudaPackages }: let + # TODO: use the redistributable cuda packages instead + inherit (cudaPackages) cudatoolkit; + # Needed for configure to find all libraries cudatoolkit' = symlinkJoin { inherit (cudatoolkit) name meta; @@ -43,6 +46,11 @@ in stdenv.mkDerivation rec { enableParallelBuilding = true; + passthru = { + cudaSupport = enableCuda; + inherit cudaPackages; + }; + meta = with lib; { description = "Unified Communication X library"; homepage = "http://www.openucx.org"; diff --git a/pkgs/development/python-modules/pytorch/default.nix b/pkgs/development/python-modules/pytorch/default.nix index 810460a30adea..f4f0d9f56234e 100644 --- a/pkgs/development/python-modules/pytorch/default.nix +++ b/pkgs/development/python-modules/pytorch/default.nix @@ -30,23 +30,42 @@ isPy3k, pythonOlder }: let - inherit (cudaPackages) cudatoolkit cudnn nccl; + inherit (cudaPackages) cudnn nccl; in # assert that everything needed for cuda is present and that the correct cuda versions are used -assert !cudaSupport || (let majorIs = lib.versions.major cudatoolkit.version; +assert cudaSupport -> (let majorIs = cudaPackages.cudaMajorVersion; in majorIs == "9" || majorIs == "10" || majorIs == "11"); # confirm that cudatoolkits are sync'd across dependencies -assert !(MPISupport && cudaSupport) || mpi.cudatoolkit == cudatoolkit; -assert !cudaSupport || magma.cudatoolkit == cudatoolkit; +# TODO: verify it's OK to equality-compare cudaPackages (which are attrsets, scopes, or whatever) +assert (MPISupport && cudaSupport) -> mpi.cudaPackages == cudaPackages; +assert cudaSupport -> (magma.cudaPackages == cudaPackages); let setBool = v: if v then "1" else "0"; + + # cudatoolkit.cc is a passthru attribute + # that points at a compatible gcc version, + # we're not really using cudatoolkit; + # not sure we really need to hard-code gcc, + # but this is out of scope of current PR + # (which is: migrating pytorch to 
redist cuda) + inherit (cudaPackages.cudatoolkit) cc; + cudatoolkit_joined = symlinkJoin { - name = "${cudatoolkit.name}-unsplit"; + name = "cudatoolkit-root"; # nccl is here purely for semantic grouping it could be moved to nativeBuildInputs - paths = [ cudatoolkit.out cudatoolkit.lib nccl.dev nccl.out ]; + paths = with cudaPackages; [ + cuda_nvcc + cuda_cudart + libcublas + libcufft + libcusolver + libcusparse + nccl.dev + nccl.out + ]; }; # Give an explicit list of supported architectures for the build, See: @@ -105,7 +124,7 @@ let final_cudaArchList = if !cudaSupport || cudaArchList != null then cudaArchList - else cudaCapabilities."cuda${lib.versions.major cudatoolkit.version}"; + else cudaCapabilities."cuda${cudaPackages.cudaMajorVersion}"; # Normally libcuda.so.1 is provided at runtime by nvidia-x11 via # LD_LIBRARY_PATH=/run/opengl-driver/lib. We only use the stub @@ -113,7 +132,7 @@ let # to recompile pytorch on every update to nvidia-x11 or the kernel. cudaStub = linkFarm "cuda-stub" [{ name = "libcuda.so.1"; - path = "${cudatoolkit}/lib/stubs/libcuda.so"; + path = "${cudaPackages.cuda_cudart}/lib/stubs/libcuda.so"; }]; cudaStubEnv = lib.optionalString cudaSupport "LD_LIBRARY_PATH=${cudaStub}\${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH "; @@ -156,7 +175,7 @@ in buildPythonPackage rec { preConfigure = lib.optionalString cudaSupport '' export TORCH_CUDA_ARCH_LIST="${lib.strings.concatStringsSep ";" final_cudaArchList}" - export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++ + export CC=${cc}/bin/gcc CXX=${cc}/bin/g++ '' + lib.optionalString (cudaSupport && cudnn != null) '' export CUDNN_INCLUDE_DIR=${cudnn}/include ''; From 0f8accbe0b469b4de5313869769e808b52080210 Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Fri, 15 Apr 2022 21:10:01 +0300 Subject: [PATCH 2/8] python3Packages.pytorch: actually use redist cuda! 
--- .../python-modules/pytorch/default.nix | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pkgs/development/python-modules/pytorch/default.nix b/pkgs/development/python-modules/pytorch/default.nix index f4f0d9f56234e..195dc5619adfe 100644 --- a/pkgs/development/python-modules/pytorch/default.nix +++ b/pkgs/development/python-modules/pytorch/default.nix @@ -55,17 +55,33 @@ let cudatoolkit_joined = symlinkJoin { name = "cudatoolkit-root"; + + # The list of required cuda redist packages can be found e.g. + # at https://github.com/pytorch/pytorch/blob/b09769992f83f94150eaef2ab9d03c37b36da159/cmake/Summary.cmake#L85 + # Not explicitly listed there are: nvml, nvtx, cccl + # # nccl is here purely for semantic grouping it could be moved to nativeBuildInputs paths = with cudaPackages; [ cuda_nvcc + cuda_nvml_dev # + cuda_nvtx # -lnvToolsExt + cuda_cccl # + cuda_nvprof # + cuda_nvrtc cuda_cudart libcublas libcufft libcusolver libcusparse + libcurand nccl.dev nccl.out ]; + + # ld is going to look for static archives (e.g. 
libcudart_static.a) in lib64 + postBuild = '' + ln -s $out/lib $out/lib64 + ''; }; # Give an explicit list of supported architectures for the build, See: @@ -173,8 +189,10 @@ in buildPythonPackage rec { ./pthreadpool-disable-gcd.diff ]; + # CUDAHOSTCXX goes into nvcc's -ccbin argument preConfigure = lib.optionalString cudaSupport '' export TORCH_CUDA_ARCH_LIST="${lib.strings.concatStringsSep ";" final_cudaArchList}" + export CUDAHOSTCXX=${cc}/bin/ export CC=${cc}/bin/gcc CXX=${cc}/bin/g++ '' + lib.optionalString (cudaSupport && cudnn != null) '' export CUDNN_INCLUDE_DIR=${cudnn}/include From 721ed638693a839d256cb71e35c7ce6d9b98d628 Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Wed, 20 Apr 2022 23:29:06 +0300 Subject: [PATCH 3/8] python3Packages.pytorch: symlinkJoin with cudaVersion --- pkgs/development/python-modules/pytorch/default.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkgs/development/python-modules/pytorch/default.nix b/pkgs/development/python-modules/pytorch/default.nix index 195dc5619adfe..817dcd1b44d47 100644 --- a/pkgs/development/python-modules/pytorch/default.nix +++ b/pkgs/development/python-modules/pytorch/default.nix @@ -54,7 +54,7 @@ let inherit (cudaPackages.cudatoolkit) cc; cudatoolkit_joined = symlinkJoin { - name = "cudatoolkit-root"; + name = "cudatoolkit-root-${cudaPackages.cudaVersion}"; # The list of required cuda redist packages can be found e.g. 
# at https://github.com/pytorch/pytorch/blob/b09769992f83f94150eaef2ab9d03c37b36da159/cmake/Summary.cmake#L85 From a632b81bc32a7f4e4c21dd86c452cec649d67071 Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Fri, 22 Apr 2022 01:56:04 +0300 Subject: [PATCH 4/8] python3Packages.pytorch: comment on why cudart --- pkgs/development/python-modules/pytorch/default.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkgs/development/python-modules/pytorch/default.nix b/pkgs/development/python-modules/pytorch/default.nix index 817dcd1b44d47..b0c213f10d7ca 100644 --- a/pkgs/development/python-modules/pytorch/default.nix +++ b/pkgs/development/python-modules/pytorch/default.nix @@ -68,7 +68,7 @@ let cuda_cccl # cuda_nvprof # cuda_nvrtc - cuda_cudart + cuda_cudart # cuda_runtime.h libcublas libcufft libcusolver From 917a8bc0edbab7fc3c1f882374d49c3750dfeffc Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Fri, 22 Apr 2022 01:56:22 +0300 Subject: [PATCH 5/8] python3Packages.pytorch: .dev->lib.getDev ensures stability in case outputs are removed later --- pkgs/development/python-modules/pytorch/default.nix | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkgs/development/python-modules/pytorch/default.nix b/pkgs/development/python-modules/pytorch/default.nix index b0c213f10d7ca..db5557c4c6784 100644 --- a/pkgs/development/python-modules/pytorch/default.nix +++ b/pkgs/development/python-modules/pytorch/default.nix @@ -74,8 +74,8 @@ let libcusolver libcusparse libcurand - nccl.dev - nccl.out + (lib.getDev nccl) + nccl ]; # ld is going to look for static archives (e.g. 
libcudart_static.a) in lib64 From b381d408adb1507580920c00424ceedb73591089 Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Sat, 23 Apr 2022 03:49:09 +0300 Subject: [PATCH 6/8] python3Packages.pytorch: add libcupti for kineto profiler gpu support --- pkgs/development/python-modules/pytorch/default.nix | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkgs/development/python-modules/pytorch/default.nix b/pkgs/development/python-modules/pytorch/default.nix index db5557c4c6784..a9f040016b1a7 100644 --- a/pkgs/development/python-modules/pytorch/default.nix +++ b/pkgs/development/python-modules/pytorch/default.nix @@ -54,7 +54,7 @@ let inherit (cudaPackages.cudatoolkit) cc; cudatoolkit_joined = symlinkJoin { - name = "cudatoolkit-root-${cudaPackages.cudaVersion}"; + name = "cuda-redist-${cudaPackages.cudaVersion}"; # The list of required cuda redist packages can be found e.g. # at https://github.com/pytorch/pytorch/blob/b09769992f83f94150eaef2ab9d03c37b36da159/cmake/Summary.cmake#L85 @@ -74,6 +74,7 @@ let libcusolver libcusparse libcurand + cuda_cupti (lib.getDev nccl) nccl ]; From 317b92ef60e4488d6f0f4f0045b343309e5fad33 Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Sat, 23 Apr 2022 14:40:30 +0300 Subject: [PATCH 7/8] magma: migrate to redist cuda packages --- .../libraries/science/math/magma/default.nix | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/pkgs/development/libraries/science/math/magma/default.nix b/pkgs/development/libraries/science/math/magma/default.nix index 73a81cb19445e..27070b655611e 100644 --- a/pkgs/development/libraries/science/math/magma/default.nix +++ b/pkgs/development/libraries/science/math/magma/default.nix @@ -1,10 +1,6 @@ -{ lib, stdenv, fetchurl, cmake, gfortran, ninja, cudaPackages, libpthreadstubs, lapack, blas }: +{ lib, stdenv, fetchurl, cmake, gfortran, ninja, cudaPackages, libpthreadstubs, lapack, blas, symlinkJoin }: -let - inherit (cudaPackages) cudatoolkit; -in - 
-assert let majorIs = lib.versions.major cudatoolkit.version; +assert let majorIs = cudaPackages.cudaMajorVersion; in majorIs == "9" || majorIs == "10" || majorIs == "11"; let @@ -37,8 +33,22 @@ let ]; }; + inherit (cudaPackages) cudaMajorVersion; + inherit (cudaPackages.cudatoolkit) cc; + + cuda_joined = symlinkJoin { + name = "cuda-redist-${cudaPackages.cudaVersion}"; + paths = with cudaPackages; [ + cuda_nvcc + cuda_cudart # cuda_runtime.h + libcublas + libcusparse + cuda_nvprof # + ]; + }; + capabilityString = lib.strings.concatStringsSep "," - cudaCapabilities."cuda${lib.versions.major cudatoolkit.version}"; + cudaCapabilities."cuda${cudaMajorVersion}"; in stdenv.mkDerivation { pname = "magma"; @@ -51,14 +61,14 @@ in stdenv.mkDerivation { nativeBuildInputs = [ gfortran cmake ninja ]; - buildInputs = [ cudatoolkit libpthreadstubs lapack blas ]; + buildInputs = [ libpthreadstubs lapack blas cuda_joined ]; cmakeFlags = [ "-DGPU_TARGET=${capabilityString}" ]; doCheck = false; preConfigure = '' - export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++ + export CC=${cc}/bin/gcc CXX=${cc}/bin/g++ ''; enableParallelBuilding=true; @@ -73,7 +83,6 @@ in stdenv.mkDerivation { }; passthru = { - # TODO: leave just cudaPackages - inherit cudatoolkit cudaPackages; + inherit cudaPackages; }; } From befe56a1ee1d383fafaf9db41e3f4fc506578da1 Mon Sep 17 00:00:00 2001 From: Someone Serge Date: Sun, 24 Apr 2022 15:23:29 +0300 Subject: [PATCH 8/8] python3Packages.pytorch: comment on cudaPackages equality --- pkgs/development/python-modules/pytorch/default.nix | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkgs/development/python-modules/pytorch/default.nix b/pkgs/development/python-modules/pytorch/default.nix index a9f040016b1a7..ff92c854a9a87 100644 --- a/pkgs/development/python-modules/pytorch/default.nix +++ b/pkgs/development/python-modules/pytorch/default.nix @@ -37,8 +37,9 @@ in assert cudaSupport -> (let majorIs = 
cudaPackages.cudaMajorVersion; in majorIs == "9" || majorIs == "10" || majorIs == "11"); -# confirm that cudatoolkits are sync'd across dependencies -# TODO: verify it's OK to equality-compare cudaPackages (which are attrsets, scopes, or whatever) +# We expect referential equality of all cudaPackages used to ensure consistency +# You can make an overlay and pass the same cudaPackages to pytorch, mpi, and magma +# TODO: `==` is an implementation detail; move comparison logic to cudaPackages assert (MPISupport && cudaSupport) -> mpi.cudaPackages == cudaPackages; assert cudaSupport -> (magma.cudaPackages == cudaPackages);