From f2575a64f70d83f6cd669caf5a2fa9623337cb0a Mon Sep 17 00:00:00 2001 From: ivarflakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Thu, 9 Oct 2025 17:04:27 +0200 Subject: [PATCH 01/14] Update CI configuration for CUDA --- .github/workflows/ci_cuda.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_cuda.yaml b/.github/workflows/ci_cuda.yaml index fc07f112c7..fe04c61117 100644 --- a/.github/workflows/ci_cuda.yaml +++ b/.github/workflows/ci_cuda.yaml @@ -10,9 +10,9 @@ jobs: group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true runs-on: - group: aws-g4dn-2xlarge + group: aws-g5-2xlarge container: - image: nvidia/cuda:12.3.1-devel-ubuntu22.04 + image: nvidia/cuda:13.0.1-devel-ubuntu24.04 options: --gpus 0 if: ${{ github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }} permissions: From 265a9db47e1013dd849a3e7998bac19cb003feb3 Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Thu, 9 Oct 2025 19:16:12 +0200 Subject: [PATCH 02/14] Try g5-4xlarge --- .github/workflows/ci_cuda.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_cuda.yaml b/.github/workflows/ci_cuda.yaml index fe04c61117..1f46583d6f 100644 --- a/.github/workflows/ci_cuda.yaml +++ b/.github/workflows/ci_cuda.yaml @@ -10,10 +10,10 @@ jobs: group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true runs-on: - group: aws-g5-2xlarge + group: aws-g5-4xlarge-cache container: image: nvidia/cuda:13.0.1-devel-ubuntu24.04 - options: --gpus 0 + options: --gpus 0 if: ${{ github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }} permissions: contents: write From 07ffa7fb22d0c7ac700e0d4995900038a49ff671 Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Thu, 9 Oct 2025 20:49:44 +0200 Subject: [PATCH 03/14] Revert runner group for now --- .github/workflows/ci_cuda.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_cuda.yaml b/.github/workflows/ci_cuda.yaml index 1f46583d6f..95c4cb122a 100644 --- a/.github/workflows/ci_cuda.yaml +++ b/.github/workflows/ci_cuda.yaml @@ -10,7 +10,7 @@ jobs: group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true runs-on: - group: aws-g5-4xlarge-cache + group: aws-g4dn-2xlarge container: image: nvidia/cuda:13.0.1-devel-ubuntu24.04 options: --gpus 0 From 3409b6c14a3f321e3f8c48ea21187eece7845827 Mon Sep 17 00:00:00 2001 From: ivarflakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Mon, 13 Oct 2025 17:48:30 +0200 Subject: [PATCH 04/14] Update ci_cuda.yaml --- .github/workflows/ci_cuda.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_cuda.yaml b/.github/workflows/ci_cuda.yaml index 95c4cb122a..1f46583d6f 100644 --- a/.github/workflows/ci_cuda.yaml +++ b/.github/workflows/ci_cuda.yaml @@ -10,7 +10,7 @@ jobs: group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true runs-on: - group: aws-g4dn-2xlarge + group: aws-g5-4xlarge-cache container: image: nvidia/cuda:13.0.1-devel-ubuntu24.04 options: --gpus 0 From 2458c93ca73f4eb56fcf211875cca6fa3e2267b3 Mon Sep 17 00:00:00 2001 From: Guillaume LEGENDRE Date: Tue, 14 Oct 2025 11:45:49 +0200 Subject: [PATCH 05/14] Add sleep step before CUDA tests in CI workflow --- .github/workflows/ci_cuda.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci_cuda.yaml b/.github/workflows/ci_cuda.yaml index 1f46583d6f..b8a67ab746 100644 --- a/.github/workflows/ci_cuda.yaml +++ b/.github/workflows/ci_cuda.yaml @@ -30,5 +30,7 @@ jobs: - name: Install Rust Stable uses: actions-rust-lang/setup-rust-toolchain@v1 - uses: Swatinem/rust-cache@v2 + - name: sleep + run: sleep 15m - name: Test (cuda) run: cargo test --features cuda From 88496ff1c70928f570a00fb0644cb627f79c1bb0 Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Fri, 17 Oct 2025 00:03:10 +0200 Subject: [PATCH 06/14] Attempt to set cuda compute cap --- .github/workflows/ci_cuda.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_cuda.yaml b/.github/workflows/ci_cuda.yaml index b8a67ab746..122e236f12 100644 --- a/.github/workflows/ci_cuda.yaml +++ b/.github/workflows/ci_cuda.yaml @@ -22,6 +22,8 @@ jobs: # with sigstore/fulcio when running outside of PRs. id-token: write security-events: write + env: + CUDA_COMPUTE_CAP: 86 steps: - name: Checkout repository uses: actions/checkout@v3 @@ -30,7 +32,5 @@ jobs: - name: Install Rust Stable uses: actions-rust-lang/setup-rust-toolchain@v1 - uses: Swatinem/rust-cache@v2 - - name: sleep - run: sleep 15m - name: Test (cuda) run: cargo test --features cuda From 9a6cbb7f481982b4e5d049130e8d0164b13cfacc Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Fri, 17 Oct 2025 14:13:00 +0200 Subject: [PATCH 07/14] Update bindgen_cuda version --- candle-examples/Cargo.toml | 2 +- candle-flash-attn/Cargo.toml | 2 +- candle-kernels/Cargo.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml index ed2035c671..dec10dd519 100644 --- a/candle-examples/Cargo.toml +++ b/candle-examples/Cargo.toml @@ -59,7 +59,7 @@ tokio = "1.48.0" [build-dependencies] anyhow = { workspace = true } -bindgen_cuda = { version = "0.1.1", optional = true } +bindgen_cuda = { version = "0.1.5", optional = true } [features] default = [] diff --git a/candle-flash-attn/Cargo.toml b/candle-flash-attn/Cargo.toml index 462d9386a0..338d416770 100644 --- a/candle-flash-attn/Cargo.toml +++ b/candle-flash-attn/Cargo.toml @@ -15,7 +15,7 @@ candle = { path = "../candle-core", features = ["cuda"], package = "candle-core" half = { version = "2.3.1", features = ["num-traits"] } [build-dependencies] -bindgen_cuda = "0.1.1" +bindgen_cuda = "0.1.5" anyhow = { version = "1", features = ["backtrace"] } [dev-dependencies] diff --git a/candle-kernels/Cargo.toml b/candle-kernels/Cargo.toml index 82756d0db9..4a2634dfa9 100644 --- a/candle-kernels/Cargo.toml +++ b/candle-kernels/Cargo.toml @@ -12,4 +12,4 @@ license = "MIT OR Apache-2.0" [dependencies] [build-dependencies] -bindgen_cuda = "0.1.1" +bindgen_cuda = "0.1.5" From 02923718c434824e133c626a38f53fe7d833095d Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Fri, 17 Oct 2025 14:23:22 +0200 Subject: [PATCH 08/14] Try a slightly older image --- .github/workflows/ci_cuda.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_cuda.yaml b/.github/workflows/ci_cuda.yaml index 122e236f12..20ee507ad1 100644 --- a/.github/workflows/ci_cuda.yaml +++ b/.github/workflows/ci_cuda.yaml @@ -12,7 +12,7 @@ jobs: runs-on: group: aws-g5-4xlarge-cache container: - image: nvidia/cuda:13.0.1-devel-ubuntu24.04 + image: nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04 options: --gpus 0 if: ${{ github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }} permissions: From 73720b8e54368ccd31237adfa37468bb2d9e5606 Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Mon, 20 Oct 2025 12:34:33 +0200 Subject: [PATCH 09/14] debugging --- .github/workflows/ci_cuda.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci_cuda.yaml b/.github/workflows/ci_cuda.yaml index 20ee507ad1..81be1bb746 100644 --- a/.github/workflows/ci_cuda.yaml +++ b/.github/workflows/ci_cuda.yaml @@ -12,7 +12,7 @@ jobs: runs-on: group: aws-g5-4xlarge-cache container: - image: nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04 + image: nvidia/cuda:13.0.1-cudnn-devel-ubuntu24.04 options: --gpus 0 if: ${{ github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }} permissions: @@ -32,5 +32,7 @@ jobs: - name: Install Rust Stable uses: actions-rust-lang/setup-rust-toolchain@v1 - uses: Swatinem/rust-cache@v2 + - name: sleep + run: sleep 15m - name: Test (cuda) run: cargo test --features cuda From 5c0eda427aeba69d7460cb8bdef76d7b73fc597b Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Mon, 20 Oct 2025 14:39:51 +0200 Subject: [PATCH 10/14] Update actions/checkout --- .github/workflows/ci_cuda.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci_cuda.yaml b/.github/workflows/ci_cuda.yaml index 81be1bb746..9884874cbd 100644 --- a/.github/workflows/ci_cuda.yaml +++ b/.github/workflows/ci_cuda.yaml @@ -26,13 +26,11 @@ jobs: CUDA_COMPUTE_CAP: 86 steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v5 - name: Install dependencies - run: apt-get update && apt install curl build-essential libssl-dev protobuf-compiler pkg-config -y + run: apt update && apt install curl build-essential libssl-dev protobuf-compiler pkg-config -y - name: Install Rust Stable uses: actions-rust-lang/setup-rust-toolchain@v1 - uses: Swatinem/rust-cache@v2 - - name: sleep - run: sleep 15m - name: Test (cuda) run: cargo test --features cuda From bd48fba8d87f07f7220e895480c86c1f8c398575 Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Mon, 20 Oct 2025 14:44:55 +0200 Subject: [PATCH 11/14] debugging --- .github/workflows/ci_cuda.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci_cuda.yaml b/.github/workflows/ci_cuda.yaml index 9884874cbd..ddbcbd9066 100644 --- a/.github/workflows/ci_cuda.yaml +++ b/.github/workflows/ci_cuda.yaml @@ -32,5 +32,9 @@ jobs: - name: Install Rust Stable uses: actions-rust-lang/setup-rust-toolchain@v1 - uses: Swatinem/rust-cache@v2 + - name: LD_LIBRARY_PATH + run: echo $LD_LIBRARY_PATH + - name: locate lib + run: find /usr/ -name 'libcuda.so.*' - name: Test (cuda) run: cargo test --features cuda From f4e5394c4010572c73c7e89616da4e62ad3f8e0a Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Mon, 20 Oct 2025 14:51:07 +0200 Subject: [PATCH 12/14] Manually add shared cuda library to LD_LIBRARY_PATH --- .github/workflows/ci_cuda.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci_cuda.yaml b/.github/workflows/ci_cuda.yaml index ddbcbd9066..3b19933dcf 100644 --- a/.github/workflows/ci_cuda.yaml +++ b/.github/workflows/ci_cuda.yaml @@ -32,9 +32,7 @@ jobs: - name: Install Rust Stable uses: actions-rust-lang/setup-rust-toolchain@v1 - uses: Swatinem/rust-cache@v2 - - name: LD_LIBRARY_PATH - run: echo $LD_LIBRARY_PATH - - name: locate lib - run: find /usr/ -name 'libcuda.so.*' + - name: Add libcuda.so to LD_LIBRARY_PATH + run: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-13.0/compat/libcuda.so.1 - name: Test (cuda) run: cargo test --features cuda From 987e0ef9a39c7fdd26becb341b53a5803c7c6f8d Mon Sep 17 00:00:00 2001 From: Guillaume LEGENDRE Date: Wed, 22 Oct 2025 14:53:51 +0200 Subject: [PATCH 13/14] Change CI runner group to aws-g5-4xlarge-cache-k8s --- .github/workflows/ci_cuda.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_cuda.yaml b/.github/workflows/ci_cuda.yaml index 3b19933dcf..68cd1558d5 100644 --- a/.github/workflows/ci_cuda.yaml +++ b/.github/workflows/ci_cuda.yaml @@ -10,7 +10,7 @@ jobs: group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true runs-on: - group: aws-g5-4xlarge-cache + group: aws-g5-4xlarge-cache-k8s container: image: nvidia/cuda:13.0.1-cudnn-devel-ubuntu24.04 options: --gpus 0 From fce33316f19daec0701bb6b79c10f0ab5eeb771c Mon Sep 17 00:00:00 2001 From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com> Date: Thu, 23 Oct 2025 15:28:51 +0200 Subject: [PATCH 14/14] Remove manual LD_LIBRARY_PATH update --- .github/workflows/ci_cuda.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci_cuda.yaml b/.github/workflows/ci_cuda.yaml index 68cd1558d5..06f1c68af6 100644 --- a/.github/workflows/ci_cuda.yaml +++ b/.github/workflows/ci_cuda.yaml @@ -32,7 +32,5 @@ jobs: - name: Install Rust Stable uses: actions-rust-lang/setup-rust-toolchain@v1 - uses: Swatinem/rust-cache@v2 - - name: Add libcuda.so to LD_LIBRARY_PATH - run: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-13.0/compat/libcuda.so.1 - name: Test (cuda) run: cargo test --features cuda