diff --git a/scripts/ci/cuda/cache_nvidia_wheels.sh b/scripts/ci/cuda/cache_nvidia_wheels.sh
new file mode 100755
--- /dev/null
+++ b/scripts/ci/cuda/cache_nvidia_wheels.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Cache and pre-install nvidia wheels that torch pins.
+#
+# pypi.nvidia.com returns Cache-Control: no-store, so pip re-downloads
+# cudnn (~707 MB) and nvshmem (~125 MB) on every CI run. This script
+# caches the wheels locally and installs them so that the subsequent
+# `pip install -e "python[dev]"` sees "Requirement already satisfied".
+#
+# Integrity: uses `unzip -t` to detect partial/corrupt downloads. Each wheel
+# is downloaded to a temporary file and only renamed into the cache after it
+# passes that check, so an interrupted download never becomes a cache entry.
+#
+# Best-effort: the cache is only an optimization. Failures are reported on
+# stderr and are not meant to abort the sourcing script, even under `set -e`.
+#
+# Usage: source scripts/ci/cuda/cache_nvidia_wheels.sh
+
+# Cache directory; may be overridden from the environment.
+NVIDIA_WHEEL_CACHE="${NVIDIA_WHEEL_CACHE:-/root/.cache/nvidia-wheels}"
+
+# Download any missing/corrupt wheel into the cache, then pre-install the
+# wheels that verified. Always returns 0.
+_cache_nvidia_wheels() {
+    local url whl tmp
+    local -a wheels=()
+
+    # Without unzip nothing can be verified, so let pip fetch the wheels itself.
+    if ! command -v unzip >/dev/null; then
+        echo "WARNING: unzip not found; skipping nvidia wheel cache" >&2
+        return 0
+    fi
+    if ! mkdir -p "$NVIDIA_WHEEL_CACHE"; then
+        echo "WARNING: cannot create $NVIDIA_WHEEL_CACHE; skipping nvidia wheel cache" >&2
+        return 0
+    fi
+
+    for url in \
+        "https://pypi.nvidia.com/nvidia-cudnn-cu12/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl" \
+        "https://pypi.nvidia.com/nvidia-nvshmem-cu12/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl"; do
+        whl="$NVIDIA_WHEEL_CACHE/${url##*/}"
+        if [ -f "$whl" ] && unzip -tq "$whl" &>/dev/null; then
+            wheels+=("$whl")
+            continue
+        fi
+
+        # Stage next to the final path so the rename stays on one filesystem.
+        tmp="$whl.tmp.$$"
+        if curl -fL -o "$tmp" "$url" \
+            && unzip -tq "$tmp" &>/dev/null \
+            && mv -f "$tmp" "$whl"; then
+            wheels+=("$whl")
+        else
+            echo "WARNING: could not cache a valid wheel from $url" >&2
+            rm -f "$tmp"
+        fi
+    done
+
+    # Install only the wheels verified above; stale files in the cache
+    # directory are ignored.
+    if [ "${#wheels[@]}" -gt 0 ]; then
+        pip install --no-deps "${wheels[@]}" \
+            || echo "WARNING: pre-installing cached nvidia wheels failed" >&2
+    fi
+    return 0
+}
+
+_cache_nvidia_wheels
+unset -f _cache_nvidia_wheels
diff --git a/scripts/ci/cuda/ci_install_dependency.sh b/scripts/ci/cuda/ci_install_dependency.sh
index a0bf776ae7b6..d30ae7e26ea9 100755
--- a/scripts/ci/cuda/ci_install_dependency.sh
+++ b/scripts/ci/cuda/ci_install_dependency.sh
@@ -24,6 +24,11 @@ set -euxo pipefail
 # ------------------------------------------------------------------------------
 # Set up environment variables
 CU_VERSION="cu129"
+
+# Nvidia package versions we override (torch pins older versions).
+# Used both as pip constraints during install and for post-install verification.
+NVIDIA_CUDNN_VERSION="9.16.0.29"
+NVIDIA_NVSHMEM_VERSION="3.4.5"
 OPTIONAL_DEPS="${1:-}"
 
 SECONDS=0
@@ -218,6 +223,7 @@ if [ -n "$OPTIONAL_DEPS" ]; then
     EXTRAS="dev,${OPTIONAL_DEPS}"
 fi
 echo "Installing python extras: [${EXTRAS}]"
+source "$(dirname "$0")/cache_nvidia_wheels.sh"
 $PIP_CMD install -e "python[${EXTRAS}]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX
 mark_step_done "Install main package"
 
@@ -331,18 +337,18 @@ fi
 
 # Fix dependencies: DeepEP depends on nvshmem 3.4.5 — skip reinstall when already correct (avoids pip races / wasted work)
 INSTALLED_NVSHMEM=$(pip show nvidia-nvshmem-cu12 2>/dev/null | grep "^Version:" | awk '{print $2}' || echo "")
-if [ "$INSTALLED_NVSHMEM" = "3.4.5" ]; then
-    echo "nvidia-nvshmem-cu12==3.4.5 already installed, skipping reinstall"
+if [ "$INSTALLED_NVSHMEM" = "$NVIDIA_NVSHMEM_VERSION" ]; then
+    echo "nvidia-nvshmem-cu12==${NVIDIA_NVSHMEM_VERSION} already installed, skipping reinstall"
 else
-    $PIP_CMD install nvidia-nvshmem-cu12==3.4.5 $PIP_INSTALL_SUFFIX
+    $PIP_CMD install nvidia-nvshmem-cu12==${NVIDIA_NVSHMEM_VERSION} $PIP_INSTALL_SUFFIX
 fi
 
 # Fix dependencies: Cudnn with version less than 9.16.0.29 will cause performance regression on Conv3D kernel
 INSTALLED_CUDNN=$(pip show nvidia-cudnn-cu12 2>/dev/null | grep "^Version:" | awk '{print $2}' || echo "")
-if [ "$INSTALLED_CUDNN" = "9.16.0.29" ]; then
-    echo "nvidia-cudnn-cu12==9.16.0.29 already installed, skipping reinstall"
+if [ "$INSTALLED_CUDNN" = "$NVIDIA_CUDNN_VERSION" ]; then
+    echo "nvidia-cudnn-cu12==${NVIDIA_CUDNN_VERSION} already installed, skipping reinstall"
 else
-    $PIP_CMD install nvidia-cudnn-cu12==9.16.0.29 $PIP_INSTALL_SUFFIX
+    $PIP_CMD install nvidia-cudnn-cu12==${NVIDIA_CUDNN_VERSION} $PIP_INSTALL_SUFFIX
 fi
 mark_step_done "Fix other dependencies"
 