diff --git a/pkgs/development/compilers/cudatoolkit/common.nix b/pkgs/development/compilers/cudatoolkit/common.nix index 2e15012452a80..617c2fd995808 100644 --- a/pkgs/development/compilers/cudatoolkit/common.nix +++ b/pkgs/development/compilers/cudatoolkit/common.nix @@ -24,6 +24,7 @@ args@ , libkrb5 , krb5 , makeWrapper +, markForCudatoolkitRootHook , ncurses5 , numactl , nss @@ -31,6 +32,7 @@ args@ , python3 # FIXME: CUDAToolkit 10 may still need python27 , pulseaudio , requireFile +, setupCudaHook , stdenv , backendStdenv # E.g. gcc11Stdenv, set in extension.nix , unixODBC @@ -80,11 +82,15 @@ backendStdenv.mkDerivation rec { addOpenGLRunpath autoPatchelfHook autoAddOpenGLRunpathHook + markForCudatoolkitRootHook ] ++ lib.optionals (lib.versionOlder version "11") [ libsForQt5.wrapQtAppsHook ] ++ lib.optionals (lib.versionAtLeast version "11.8") [ qt6Packages.wrapQtAppsHook ]; + depsTargetTargetPropagated = [ + setupCudaHook + ]; buildInputs = lib.optionals (lib.versionOlder version "11") [ libsForQt5.qt5.qtwebengine freeglut @@ -280,24 +286,12 @@ backendStdenv.mkDerivation rec { sed -i "1 i#define _BITS_FLOATN_H" "$out/include/host_defines.h" '' + # Point NVCC at a compatible compiler - # FIXME: redist cuda_nvcc copy-pastes this code - # Refer to comments in the overrides for cuda_nvcc for explanation # CUDA_TOOLKIT_ROOT_DIR is legacy, # Cf. https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables - # NOTE: We unconditionally set -Xfatbin=-compress-all, which reduces the size of the compiled - # binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as - # the default set of CUDA capabilities we build can regularly cause this to occur (for - # example, with Magma). '' mkdir -p $out/nix-support cat <> $out/nix-support/setup-hook cmakeFlags+=' -DCUDA_TOOLKIT_ROOT_DIR=$out' - cmakeFlags+=' -DCUDA_HOST_COMPILER=${backendStdenv.cc}/bin' - cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${backendStdenv.cc}/bin' - if [ -z "\''${CUDAHOSTCXX-}" ]; then - export CUDAHOSTCXX=${backendStdenv.cc}/bin; - fi - export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${backendStdenv.cc}/bin -Xfatbin=-compress-all' EOF # Move some libraries to the lib output so that programs that diff --git a/pkgs/development/compilers/cudatoolkit/extension.nix b/pkgs/development/compilers/cudatoolkit/extension.nix index f14a55aa6cf71..93800a0dbc6b1 100644 --- a/pkgs/development/compilers/cudatoolkit/extension.nix +++ b/pkgs/development/compilers/cudatoolkit/extension.nix @@ -38,10 +38,39 @@ final: prev: let cudaFlags = final.callPackage ./flags.nix {}; + # Internal hook, used by cudatoolkit and cuda redist packages + # to accommodate automatic CUDAToolkit_ROOT construction + markForCudatoolkitRootHook = (final.callPackage + ({ makeSetupHook }: + makeSetupHook + { name = "mark-for-cudatoolkit-root-hook"; } + ./hooks/mark-for-cudatoolkit-root-hook.sh) + { }); + + # Normally propagated by cuda_nvcc or cudatoolkit through their depsHostHostPropagated + setupCudaHook = (final.callPackage + ({ makeSetupHook, backendStdenv }: + makeSetupHook + { + name = "setup-cuda-hook"; + + substitutions.ccRoot = "${backendStdenv.cc}"; + + # Required in addition to ccRoot as otherwise bin/gcc is looked up + # when building CMakeCUDACompilerId.cu + substitutions.ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++"; + } + ./hooks/setup-cuda-hook.sh) + { }); + in { inherit backendStdenv cudatoolkit - cudaFlags; + cudaFlags + markForCudatoolkitRootHook + setupCudaHook; + + saxpy = final.callPackage ./saxpy { }; } diff --git a/pkgs/development/compilers/cudatoolkit/hooks/mark-for-cudatoolkit-root-hook.sh b/pkgs/development/compilers/cudatoolkit/hooks/mark-for-cudatoolkit-root-hook.sh new file mode 100644 index 0000000000000..5c18760a3a2b0 --- /dev/null +++ b/pkgs/development/compilers/cudatoolkit/hooks/mark-for-cudatoolkit-root-hook.sh @@ -0,0 +1,8 @@ +# shellcheck shell=bash + +markForCUDAToolkit_ROOT() { + mkdir -p "${prefix}/nix-support" + touch "${prefix}/nix-support/include-in-cudatoolkit-root" +} + +fixupOutputHooks+=(markForCUDAToolkit_ROOT) diff --git a/pkgs/development/compilers/cudatoolkit/hooks/nvcc-setup-hook.sh b/pkgs/development/compilers/cudatoolkit/hooks/nvcc-setup-hook.sh new file mode 100644 index 0000000000000..89801eb7c21a5 --- /dev/null +++ b/pkgs/development/compilers/cudatoolkit/hooks/nvcc-setup-hook.sh @@ -0,0 +1,5 @@ +# shellcheck shell=bash + +# CMake's enable_language(CUDA) runs a compiler test and it doesn't account for +# CUDAToolkit_ROOT. We have to help it locate libcudart +export NVCC_APPEND_FLAGS+=" -L@cudartRoot@/lib -I@cudartRoot@/include" diff --git a/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh b/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh new file mode 100644 index 0000000000000..89256e86f7898 --- /dev/null +++ b/pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh @@ -0,0 +1,68 @@ +# shellcheck shell=bash + +echo Sourcing setup-cuda-hook >&2 + +extendCUDAToolkit_ROOT() { + if [[ -f "$1/nix-support/include-in-cudatoolkit-root" ]] ; then + addToSearchPathWithCustomDelimiter ";" CUDAToolkit_ROOT "$1" + + if [[ -d "$1/include" ]] ; then + addToSearchPathWithCustomDelimiter ";" CUDAToolkit_INCLUDE_DIR "$1/include" + fi + fi +} + +addEnvHooks "$targetOffset" extendCUDAToolkit_ROOT + +setupCUDAToolkitCompilers() { + echo Executing setupCUDAToolkitCompilers >&2 + + if [[ -n "${dontSetupCUDAToolkitCompilers-}" ]] ; then + return + fi + + # Point NVCC at a compatible compiler + + # For CMake-based projects: + # https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables + # https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html + # https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html + + export cmakeFlags+=" -DCUDA_HOST_COMPILER=@ccFullPath@" + export cmakeFlags+=" -DCMAKE_CUDA_HOST_COMPILER=@ccFullPath@" + + # For non-CMake projects: + # We prepend --compiler-bindir to nvcc flags. + # Downstream packages can override these, because NVCC + # uses the last --compiler-bindir it gets on the command line. + # FIXME: this results in "incompatible redefinition" warnings. + # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin + if [ -z "${CUDAHOSTCXX-}" ]; then + export CUDAHOSTCXX="@ccFullPath@"; + fi + + export NVCC_PREPEND_FLAGS+=" --compiler-bindir=@ccRoot@/bin" + + # NOTE: We set -Xfatbin=-compress-all, which reduces the size of the compiled + # binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as + # the default set of CUDA capabilities we build can regularly cause this to occur (for + # example, with Magma). + # + # @SomeoneSerge: original comment was made by @ConnorBaker in .../cudatoolkit/common.nix + if [[ -z "${dontCompressFatbin-}" ]]; then + export NVCC_PREPEND_FLAGS+=" -Xfatbin=-compress-all" + fi + + # CMake's enable_language(CUDA) runs a compiler test and it doesn't account for + # CUDAToolkit_ROOT. We have to help it locate libcudart + if [[ -z "${nvccDontPrependCudartFlags-}" ]] ; then + export NVCC_APPEND_FLAGS+=" -L@cudartRoot@/lib -I@cudartRoot@/include" + fi +} + +setupCMakeCUDAToolkit_ROOT() { + export cmakeFlags+=" -DCUDAToolkit_INCLUDE_DIR=$CUDAToolkit_INCLUDE_DIR -DCUDAToolkit_ROOT=$CUDAToolkit_ROOT" +} + +postHooks+=(setupCUDAToolkitCompilers) +preConfigureHooks+=(setupCMakeCUDAToolkit_ROOT) diff --git a/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix b/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix index 3a6a16b1d7698..ec2c9cf72a9e2 100644 --- a/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix +++ b/pkgs/development/compilers/cudatoolkit/redist/build-cuda-redist-package.nix @@ -4,6 +4,7 @@ , fetchurl , autoPatchelfHook , autoAddOpenGLRunpathHook +, markForCudatoolkitRootHook }: pname: @@ -28,6 +29,7 @@ backendStdenv.mkDerivation { # directory to the rpath of all ELF binaries. # Check e.g. with `patchelf --print-rpath path/to/my/binary autoAddOpenGLRunpathHook + markForCudatoolkitRootHook ]; buildInputs = [ diff --git a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix index 7b8e02de24741..b962b6caa1b27 100644 --- a/pkgs/development/compilers/cudatoolkit/redist/overrides.nix +++ b/pkgs/development/compilers/cudatoolkit/redist/overrides.nix @@ -27,35 +27,27 @@ in inherit (prev.backendStdenv) cc; in { - # Point NVCC at a compatible compiler - # FIXME: non-redist cudatoolkit copy-pastes this code + # Required by cmake's enable_language(CUDA) to build a test program + # When implementing cross-compilation support: this is + # final.pkgs.targetPackages.cudaPackages.cuda_cudart + env.cudartRoot = "${prev.lib.getDev final.cuda_cudart}"; - # For CMake-based projects: - # https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables - # https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html - # https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html + # Point NVCC at a compatible compiler - # For non-CMake projects: - # We prepend --compiler-bindir to nvcc flags. - # Downstream packages can override these, because NVCC - # uses the last --compiler-bindir it gets on the command line. - # FIXME: this results in "incompatible redefinition" warnings. - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin - # NOTE: We unconditionally set -Xfatbin=-compress-all, which reduces the size of the - # compiled binaries. If binaries grow over 2GB, they will fail to link. This is a problem - # for us, as the default set of CUDA capabilities we build can regularly cause this to - # occur (for example, with Magma). - postInstall = (oldAttrs.postInstall or "") + '' - mkdir -p $out/nix-support - cat <> $out/nix-support/setup-hook - cmakeFlags+=' -DCUDA_HOST_COMPILER=${cc}/bin' - cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${cc}/bin' - if [ -z "\''${CUDAHOSTCXX-}" ]; then - export CUDAHOSTCXX=${cc}/bin; - fi - export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${cc}/bin -Xfatbin=-compress-all' - EOF - ''; + # Desiredata: whenever a package (e.g. magma) adds cuda_nvcc to + # nativeBuildInputs (offsets `(-1, 0)`), magma should also source the + # setupCudaHook, i.e. we want it the hook to be propagated into the + # same nativeBuildInputs. + # + # Logically, cuda_nvcc should include the hook in depsHostHostPropagated, + # so that the final offsets for the propagated hook would be `(-1, 0) + + # (0, 0) = (-1, 0)`. + # + # In practice, TargetTarget appears to work: + # https://gist.github.com/fd80ff142cd25e64603618a3700e7f82 + depsTargetTargetPropagated = [ + final.setupCudaHook + ]; }); cuda_nvprof = prev.cuda_nvprof.overrideAttrs (oldAttrs: { diff --git a/pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt b/pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt new file mode 100644 index 0000000000000..a6954e6e8bee2 --- /dev/null +++ b/pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.25) +project(saxpy LANGUAGES CXX CUDA) + +find_package(CUDAToolkit REQUIRED COMPONENTS cudart cublas) + +add_executable(saxpy saxpy.cu) +target_link_libraries(saxpy PUBLIC CUDA::cublas CUDA::cudart m) +target_compile_features(saxpy PRIVATE cxx_std_14) +target_compile_options(saxpy PRIVATE $<$: + --expt-relaxed-constexpr>) + +install(TARGETS saxpy) diff --git a/pkgs/development/compilers/cudatoolkit/saxpy/default.nix b/pkgs/development/compilers/cudatoolkit/saxpy/default.nix new file mode 100644 index 0000000000000..f347b43d1d11c --- /dev/null +++ b/pkgs/development/compilers/cudatoolkit/saxpy/default.nix @@ -0,0 +1,50 @@ +{ autoAddOpenGLRunpathHook +, backendStdenv +, cmake +, cuda_cccl +, cuda_cudart +, cudaFlags +, cuda_nvcc +, lib +, libcublas +, setupCudaHook +, stdenv +}: + +backendStdenv.mkDerivation { + pname = "saxpy"; + version = "unstable-2023-07-11"; + + src = ./.; + + buildInputs = [ + libcublas + cuda_cudart + cuda_cccl + ]; + nativeBuildInputs = [ + cmake + + # NOTE: this needs to be pkgs.buildPackages.cudaPackages_XX_Y.cuda_nvcc for + # cross-compilation to work. This should work automatically once we move to + # spliced scopes. Delete this comment once that happens + cuda_nvcc + + # Alternatively, we could remove the propagated hook from cuda_nvcc and add + # directly: + # setupCudaHook + autoAddOpenGLRunpathHook + ]; + + cmakeFlags = [ + "-DCMAKE_VERBOSE_MAKEFILE=ON" + "-DCMAKE_CUDA_ARCHITECTURES=${with cudaFlags; builtins.concatStringsSep ";" (map dropDot cudaCapabilities)}" + ]; + + meta = { + description = "A simple (Single-precision AX Plus Y) FindCUDAToolkit.cmake example for testing cross-compilation"; + license = lib.licenses.mit; + maintainers = lib.teams.cuda.members; + platforms = lib.platforms.unix; + }; +} diff --git a/pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu b/pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu new file mode 100644 index 0000000000000..912a6d1647b14 --- /dev/null +++ b/pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu @@ -0,0 +1,68 @@ +#include +#include +#include + +#include + +static inline void check(cudaError_t err, const char *context) { + if (err != cudaSuccess) { + fprintf(stderr, "CUDA error at %s: %s\n", context, cudaGetErrorString(err)); + std::exit(EXIT_FAILURE); + } +} + +#define CHECK(x) check(x, #x) + +__global__ void saxpy(int n, float a, float *x, float *y) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) + y[i] = a * x[i] + y[i]; +} + +int main(void) { + setbuf(stderr, NULL); + fprintf(stderr, "Start\n"); + + int rtVersion, driverVersion; + CHECK(cudaRuntimeGetVersion(&rtVersion)); + CHECK(cudaDriverGetVersion(&driverVersion)); + + fprintf(stderr, "Runtime version: %d\n", rtVersion); + fprintf(stderr, "Driver version: %d\n", driverVersion); + + constexpr int N = 1 << 10; + + std::vector xHost(N), yHost(N); + for (int i = 0; i < N; i++) { + xHost[i] = 1.0f; + yHost[i] = 2.0f; + } + + fprintf(stderr, "Host memory initialized, copying to the device\n"); + fflush(stderr); + + float *xDevice, *yDevice; + CHECK(cudaMalloc(&xDevice, N * sizeof(float))); + CHECK(cudaMalloc(&yDevice, N * sizeof(float))); + + CHECK(cudaMemcpy(xDevice, xHost.data(), N * sizeof(float), + cudaMemcpyHostToDevice)); + CHECK(cudaMemcpy(yDevice, yHost.data(), N * sizeof(float), + cudaMemcpyHostToDevice)); + fprintf(stderr, "Scheduled a cudaMemcpy, calling the kernel\n"); + + saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, xDevice, yDevice); + fprintf(stderr, "Scheduled a kernel call\n"); + CHECK(cudaGetLastError()); + + CHECK(cudaMemcpy(yHost.data(), yDevice, N * sizeof(float), + cudaMemcpyDeviceToHost)); + + float maxError = 0.0f; + for (int i = 0; i < N; i++) + maxError = max(maxError, abs(yHost[i] - 4.0f)); + fprintf(stderr, "Max error: %f\n", maxError); + + CHECK(cudaFree(xDevice)); + CHECK(cudaFree(yDevice)); +} diff --git a/pkgs/development/libraries/science/math/magma/generic.nix b/pkgs/development/libraries/science/math/magma/generic.nix index d10da6843c36b..04f263568ce6f 100644 --- a/pkgs/development/libraries/science/math/magma/generic.nix +++ b/pkgs/development/libraries/science/math/magma/generic.nix @@ -86,29 +86,6 @@ let # "75" -> "750" Cf. https://bitbucket.org/icl/magma/src/f4ec79e2c13a2347eff8a77a3be6f83bc2daec20/CMakeLists.txt#lines-273 "${minArch'}0"; - cuda-common-redist = with cudaPackages; [ - libcublas # cublas_v2.h - libcusparse # cusparse.h - ]; - - # Build-time dependencies - cuda-native-redist = symlinkJoin { - name = "cuda-native-redist-${cudaVersion}"; - paths = with cudaPackages; [ - cuda_cudart # cuda_runtime.h - cuda_nvcc - ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [ - cuda_nvprof # - ] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [ - cuda_profiler_api # - ] ++ cuda-common-redist; - }; - - # Run-time dependencies - cuda-redist = symlinkJoin { - name = "cuda-redist-${cudaVersion}"; - paths = cuda-common-redist; - }; in assert (builtins.match "[^[:space:]]*" gpuTargetString) != null; @@ -128,16 +105,22 @@ stdenv.mkDerivation { ninja gfortran ] ++ lists.optionals cudaSupport [ - cuda-native-redist + cudaPackages.cuda_nvcc ]; buildInputs = [ libpthreadstubs lapack blas - ] ++ lists.optionals cudaSupport [ - cuda-redist - ] ++ lists.optionals rocmSupport [ + ] ++ lists.optionals cudaSupport (with cudaPackages; [ + cuda_cudart + libcublas # cublas_v2.h + libcusparse # cusparse.h + ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [ + cuda_nvprof # + ] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [ + cuda_profiler_api # + ]) ++ lists.optionals rocmSupport [ hip hipblas hipsparse