Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 6 additions & 12 deletions pkgs/development/compilers/cudatoolkit/common.nix
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@ args@
, libkrb5
, krb5
, makeWrapper
, markForCudatoolkitRootHook
, ncurses5
, numactl
, nss
, perl
, python3 # FIXME: CUDAToolkit 10 may still need python27
, pulseaudio
, requireFile
, setupCudaHook
, stdenv
, backendStdenv # E.g. gcc11Stdenv, set in extension.nix
, unixODBC
Expand Down Expand Up @@ -80,11 +82,15 @@ backendStdenv.mkDerivation rec {
addOpenGLRunpath
autoPatchelfHook
autoAddOpenGLRunpathHook
markForCudatoolkitRootHook
] ++ lib.optionals (lib.versionOlder version "11") [
libsForQt5.wrapQtAppsHook
] ++ lib.optionals (lib.versionAtLeast version "11.8") [
qt6Packages.wrapQtAppsHook
];
depsTargetTargetPropagated = [
setupCudaHook
];
buildInputs = lib.optionals (lib.versionOlder version "11") [
libsForQt5.qt5.qtwebengine
freeglut
Expand Down Expand Up @@ -280,24 +286,12 @@ backendStdenv.mkDerivation rec {
sed -i "1 i#define _BITS_FLOATN_H" "$out/include/host_defines.h"
'' +
# Point NVCC at a compatible compiler
# FIXME: redist cuda_nvcc copy-pastes this code
# Refer to comments in the overrides for cuda_nvcc for explanation
# CUDA_TOOLKIT_ROOT_DIR is legacy,
# Cf. https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
# NOTE: We unconditionally set -Xfatbin=-compress-all, which reduces the size of the compiled
# binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as
# the default set of CUDA capabilities we build can regularly cause this to occur (for
# example, with Magma).
''
mkdir -p $out/nix-support
cat <<EOF >> $out/nix-support/setup-hook
cmakeFlags+=' -DCUDA_TOOLKIT_ROOT_DIR=$out'
cmakeFlags+=' -DCUDA_HOST_COMPILER=${backendStdenv.cc}/bin'
cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${backendStdenv.cc}/bin'
if [ -z "\''${CUDAHOSTCXX-}" ]; then
export CUDAHOSTCXX=${backendStdenv.cc}/bin;
fi
export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${backendStdenv.cc}/bin -Xfatbin=-compress-all'
EOF

# Move some libraries to the lib output so that programs that
Expand Down
31 changes: 30 additions & 1 deletion pkgs/development/compilers/cudatoolkit/extension.nix
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,39 @@ final: prev: let

cudaFlags = final.callPackage ./flags.nix {};

# Internal hook, used by cudatoolkit and cuda redist packages
# to accommodate automatic CUDAToolkit_ROOT construction
markForCudatoolkitRootHook = (final.callPackage
({ makeSetupHook }:
makeSetupHook
{ name = "mark-for-cudatoolkit-root-hook"; }
./hooks/mark-for-cudatoolkit-root-hook.sh)
{ });

# Normally propagated by cuda_nvcc or cudatoolkit through their depsHostHostPropagated
setupCudaHook = (final.callPackage
({ makeSetupHook, backendStdenv }:
makeSetupHook
{
name = "setup-cuda-hook";

substitutions.ccRoot = "${backendStdenv.cc}";

# Required in addition to ccRoot as otherwise bin/gcc is looked up
# when building CMakeCUDACompilerId.cu
substitutions.ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++";
}
./hooks/setup-cuda-hook.sh)
{ });

in
{
inherit
backendStdenv
cudatoolkit
cudaFlags;
cudaFlags
markForCudatoolkitRootHook
setupCudaHook;

saxpy = final.callPackage ./saxpy { };
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# shellcheck shell=bash

markForCUDAToolkit_ROOT() {
mkdir -p "${prefix}/nix-support"
touch "${prefix}/nix-support/include-in-cudatoolkit-root"
}

fixupOutputHooks+=(markForCUDAToolkit_ROOT)
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# shellcheck shell=bash

# CMake's enable_language(CUDA) runs a compiler test and it doesn't account for
# CUDAToolkit_ROOT. We have to help it locate libcudart
export NVCC_APPEND_FLAGS+=" -L@cudartRoot@/lib -I@cudartRoot@/include"
68 changes: 68 additions & 0 deletions pkgs/development/compilers/cudatoolkit/hooks/setup-cuda-hook.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# shellcheck shell=bash

echo Sourcing setup-cuda-hook >&2

extendCUDAToolkit_ROOT() {
if [[ -f "$1/nix-support/include-in-cudatoolkit-root" ]] ; then
addToSearchPathWithCustomDelimiter ";" CUDAToolkit_ROOT "$1"

if [[ -d "$1/include" ]] ; then
addToSearchPathWithCustomDelimiter ";" CUDAToolkit_INCLUDE_DIR "$1/include"
fi
fi
}

addEnvHooks "$targetOffset" extendCUDAToolkit_ROOT

setupCUDAToolkitCompilers() {
echo Executing setupCUDAToolkitCompilers >&2

if [[ -n "${dontSetupCUDAToolkitCompilers-}" ]] ; then
return
fi

# Point NVCC at a compatible compiler

# For CMake-based projects:
# https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
# https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html
# https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html

export cmakeFlags+=" -DCUDA_HOST_COMPILER=@ccFullPath@"
export cmakeFlags+=" -DCMAKE_CUDA_HOST_COMPILER=@ccFullPath@"

# For non-CMake projects:
# We prepend --compiler-bindir to nvcc flags.
# Downstream packages can override these, because NVCC
# uses the last --compiler-bindir it gets on the command line.
# FIXME: this results in "incompatible redefinition" warnings.
# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin
if [ -z "${CUDAHOSTCXX-}" ]; then
export CUDAHOSTCXX="@ccFullPath@";
fi

export NVCC_PREPEND_FLAGS+=" --compiler-bindir=@ccRoot@/bin"

# NOTE: We set -Xfatbin=-compress-all, which reduces the size of the compiled
# binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as
# the default set of CUDA capabilities we build can regularly cause this to occur (for
# example, with Magma).
#
# @SomeoneSerge: original comment was made by @ConnorBaker in .../cudatoolkit/common.nix
if [[ -z "${dontCompressFatbin-}" ]]; then
export NVCC_PREPEND_FLAGS+=" -Xfatbin=-compress-all"
fi

# CMake's enable_language(CUDA) runs a compiler test and it doesn't account for
# CUDAToolkit_ROOT. We have to help it locate libcudart
if [[ -z "${nvccDontPrependCudartFlags-}" ]] ; then
export NVCC_APPEND_FLAGS+=" -L@cudartRoot@/lib -I@cudartRoot@/include"
fi
}

setupCMakeCUDAToolkit_ROOT() {
export cmakeFlags+=" -DCUDAToolkit_INCLUDE_DIR=$CUDAToolkit_INCLUDE_DIR -DCUDAToolkit_ROOT=$CUDAToolkit_ROOT"
}

postHooks+=(setupCUDAToolkitCompilers)
preConfigureHooks+=(setupCMakeCUDAToolkit_ROOT)
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
, fetchurl
, autoPatchelfHook
, autoAddOpenGLRunpathHook
, markForCudatoolkitRootHook
}:

pname:
Expand All @@ -28,6 +29,7 @@ backendStdenv.mkDerivation {
# directory to the rpath of all ELF binaries.
# Check e.g. with `patchelf --print-rpath path/to/my/binary
autoAddOpenGLRunpathHook
markForCudatoolkitRootHook
];

buildInputs = [
Expand Down
46 changes: 19 additions & 27 deletions pkgs/development/compilers/cudatoolkit/redist/overrides.nix
Original file line number Diff line number Diff line change
Expand Up @@ -27,35 +27,27 @@ in
inherit (prev.backendStdenv) cc;
in
{
# Point NVCC at a compatible compiler
# FIXME: non-redist cudatoolkit copy-pastes this code
# Required by cmake's enable_language(CUDA) to build a test program
# When implementing cross-compilation support: this is
# final.pkgs.targetPackages.cudaPackages.cuda_cudart
env.cudartRoot = "${prev.lib.getDev final.cuda_cudart}";

# For CMake-based projects:
# https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
# https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html
# https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html
# Point NVCC at a compatible compiler

# For non-CMake projects:
# We prepend --compiler-bindir to nvcc flags.
# Downstream packages can override these, because NVCC
# uses the last --compiler-bindir it gets on the command line.
# FIXME: this results in "incompatible redefinition" warnings.
# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin
# NOTE: We unconditionally set -Xfatbin=-compress-all, which reduces the size of the
# compiled binaries. If binaries grow over 2GB, they will fail to link. This is a problem
# for us, as the default set of CUDA capabilities we build can regularly cause this to
# occur (for example, with Magma).
postInstall = (oldAttrs.postInstall or "") + ''
mkdir -p $out/nix-support
cat <<EOF >> $out/nix-support/setup-hook
cmakeFlags+=' -DCUDA_HOST_COMPILER=${cc}/bin'
cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${cc}/bin'
if [ -z "\''${CUDAHOSTCXX-}" ]; then
export CUDAHOSTCXX=${cc}/bin;
fi
export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${cc}/bin -Xfatbin=-compress-all'
EOF
'';
# Desiredata: whenever a package (e.g. magma) adds cuda_nvcc to
# nativeBuildInputs (offsets `(-1, 0)`), magma should also source the
# setupCudaHook, i.e. we want it the hook to be propagated into the
# same nativeBuildInputs.
#
# Logically, cuda_nvcc should include the hook in depsHostHostPropagated,
# so that the final offsets for the propagated hook would be `(-1, 0) +
# (0, 0) = (-1, 0)`.
#
# In practice, TargetTarget appears to work:
# https://gist.github.com/fd80ff142cd25e64603618a3700e7f82
depsTargetTargetPropagated = [
final.setupCudaHook
];
});

cuda_nvprof = prev.cuda_nvprof.overrideAttrs (oldAttrs: {
Expand Down
12 changes: 12 additions & 0 deletions pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
cmake_minimum_required(VERSION 3.25)
project(saxpy LANGUAGES CXX CUDA)

find_package(CUDAToolkit REQUIRED COMPONENTS cudart cublas)

add_executable(saxpy saxpy.cu)
target_link_libraries(saxpy PUBLIC CUDA::cublas CUDA::cudart m)
target_compile_features(saxpy PRIVATE cxx_std_14)
target_compile_options(saxpy PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--expt-relaxed-constexpr>)

install(TARGETS saxpy)
50 changes: 50 additions & 0 deletions pkgs/development/compilers/cudatoolkit/saxpy/default.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{ autoAddOpenGLRunpathHook
, backendStdenv
, cmake
, cuda_cccl
, cuda_cudart
, cudaFlags
, cuda_nvcc
, lib
, libcublas
, setupCudaHook
, stdenv
}:

backendStdenv.mkDerivation {
pname = "saxpy";
version = "unstable-2023-07-11";

src = ./.;

buildInputs = [
libcublas
cuda_cudart
cuda_cccl
];
nativeBuildInputs = [
cmake

# NOTE: this needs to be pkgs.buildPackages.cudaPackages_XX_Y.cuda_nvcc for
# cross-compilation to work. This should work automatically once we move to
# spliced scopes. Delete this comment once that happens
cuda_nvcc

# Alternatively, we could remove the propagated hook from cuda_nvcc and add
# directly:
# setupCudaHook
autoAddOpenGLRunpathHook
];

cmakeFlags = [
"-DCMAKE_VERBOSE_MAKEFILE=ON"
"-DCMAKE_CUDA_ARCHITECTURES=${with cudaFlags; builtins.concatStringsSep ";" (map dropDot cudaCapabilities)}"
];

meta = {
description = "A simple (Single-precision AX Plus Y) FindCUDAToolkit.cmake example for testing cross-compilation";
license = lib.licenses.mit;
maintainers = lib.teams.cuda.members;
platforms = lib.platforms.unix;
};
}
68 changes: 68 additions & 0 deletions pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <vector>

#include <stdio.h>

static inline void check(cudaError_t err, const char *context) {
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error at %s: %s\n", context, cudaGetErrorString(err));
std::exit(EXIT_FAILURE);
}
}

#define CHECK(x) check(x, #x)

__global__ void saxpy(int n, float a, float *x, float *y) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
y[i] = a * x[i] + y[i];
}

int main(void) {
setbuf(stderr, NULL);
fprintf(stderr, "Start\n");

int rtVersion, driverVersion;
CHECK(cudaRuntimeGetVersion(&rtVersion));
CHECK(cudaDriverGetVersion(&driverVersion));

fprintf(stderr, "Runtime version: %d\n", rtVersion);
fprintf(stderr, "Driver version: %d\n", driverVersion);

constexpr int N = 1 << 10;

std::vector<float> xHost(N), yHost(N);
for (int i = 0; i < N; i++) {
xHost[i] = 1.0f;
yHost[i] = 2.0f;
}

fprintf(stderr, "Host memory initialized, copying to the device\n");
fflush(stderr);

float *xDevice, *yDevice;
CHECK(cudaMalloc(&xDevice, N * sizeof(float)));
CHECK(cudaMalloc(&yDevice, N * sizeof(float)));

CHECK(cudaMemcpy(xDevice, xHost.data(), N * sizeof(float),
cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(yDevice, yHost.data(), N * sizeof(float),
cudaMemcpyHostToDevice));
fprintf(stderr, "Scheduled a cudaMemcpy, calling the kernel\n");

saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, xDevice, yDevice);
fprintf(stderr, "Scheduled a kernel call\n");
CHECK(cudaGetLastError());

CHECK(cudaMemcpy(yHost.data(), yDevice, N * sizeof(float),
cudaMemcpyDeviceToHost));

float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = max(maxError, abs(yHost[i] - 4.0f));
fprintf(stderr, "Max error: %f\n", maxError);

CHECK(cudaFree(xDevice));
CHECK(cudaFree(yDevice));
}
Loading