diff --git a/scripts/ci/ci_install_deepep.sh b/scripts/ci/ci_install_deepep.sh index 2f52643056bd..92258c19f618 100755 --- a/scripts/ci/ci_install_deepep.sh +++ b/scripts/ci/ci_install_deepep.sh @@ -70,7 +70,13 @@ if [ "$GRACE_BLACKWELL" = "1" ]; then if [ "$CUDA_VERSION" = "12.8" ]; then CHOSEN_TORCH_CUDA_ARCH_LIST='10.0' elif awk -v ver="$CUDA_VERSION" 'BEGIN {exit !(ver > 12.8)}'; then - CHOSEN_TORCH_CUDA_ARCH_LIST='10.0;10.3' + # With CUDA > 12.8, the compiler supports 10.3, so we would normally use + # CHOSEN_TORCH_CUDA_ARCH_LIST='10.0;10.3' + # + # However, our CI machine has a weird setup and nvidia-smi reports the wrong CUDA version in the container. + # The container is actually CUDA 12.8, but nvidia-smi reports 13.0, leading to compilation errors, so we + # drop 10.3. + CHOSEN_TORCH_CUDA_ARCH_LIST='10.0' else echo "Unsupported CUDA version for Grace Blackwell: $CUDA_VERSION" && exit 1 fi && \