Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions scripts/ci/cuda/cache_nvidia_wheels.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Cache and pre-install the nvidia wheels that torch pins.
#
# pypi.nvidia.com returns Cache-Control: no-store, so pip re-downloads
# cudnn (~707 MB) and nvshmem (~125 MB) on every CI run. This script
# caches the wheels locally and installs them so that the subsequent
# `pip install -e "python[dev]"` sees "Requirement already satisfied".
#
# Integrity: uses `unzip -t` to detect partial/corrupt downloads. A wheel is
# downloaded to a temporary name and only moved into the cache after it
# passes the check, so an interrupted download never leaves a corrupt file
# under the final cache name.
#
# Best-effort: a failed download or install only prints a warning. The
# cache is an optimisation; the main package install can still fetch the
# wheels from the index itself.
#
# Usage: source scripts/ci/cuda/cache_nvidia_wheels.sh
#
# Optional environment (all have defaults, so the script also works alone):
#   NVIDIA_WHEEL_CACHE  cache directory (default: /root/.cache/nvidia-wheels)
#   PIP_CMD             pip front end used for the install (default: pip)
#   PIP_INSTALL_SUFFIX  extra arguments appended to the install command
#
# NOTE: this file is meant to be sourced, so it deliberately does not change
# shell options (`set -e` etc.) and never calls `exit`.

NVIDIA_WHEEL_CACHE="${NVIDIA_WHEEL_CACHE:-/root/.cache/nvidia-wheels}"
mkdir -p "$NVIDIA_WHEEL_CACHE"

# Wheels that are present and verified after the loop; only these exact files
# are installed, so stale versions left in the cache are never passed to pip.
_nvidia_wheels=()

for url in \
    "https://pypi.nvidia.com/nvidia-cudnn-cu12/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl" \
    "https://pypi.nvidia.com/nvidia-nvshmem-cu12/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl"; do
    whl="$NVIDIA_WHEEL_CACHE/${url##*/}"

    # (Re)download when the wheel is missing or fails the zip integrity test.
    if [ ! -f "$whl" ] || ! unzip -tq "$whl" &>/dev/null; then
        _nvidia_whl_tmp="$whl.part.$$"
        if curl -fL -o "$_nvidia_whl_tmp" "$url" \
            && unzip -tq "$_nvidia_whl_tmp" &>/dev/null; then
            mv -f "$_nvidia_whl_tmp" "$whl"
        else
            echo "WARNING: could not fetch a valid wheel from $url; pip will download it instead" >&2
            # Drop both the partial download and any corrupt cached copy.
            rm -f "$_nvidia_whl_tmp" "$whl"
        fi
    fi

    if [ -f "$whl" ]; then
        _nvidia_wheels+=("$whl")
    fi
done

if [ "${#_nvidia_wheels[@]}" -gt 0 ]; then
    # PIP_CMD / PIP_INSTALL_SUFFIX are intentionally unquoted: they may hold
    # several words (the sourcing CI script invokes them the same way).
    # shellcheck disable=SC2086
    ${PIP_CMD:-pip} install --no-deps "${_nvidia_wheels[@]}" ${PIP_INSTALL_SUFFIX:-} \
        || echo "WARNING: pre-installing cached nvidia wheels failed; continuing" >&2
fi

unset _nvidia_wheels _nvidia_whl_tmp
18 changes: 12 additions & 6 deletions scripts/ci/cuda/ci_install_dependency.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ set -euxo pipefail
# ------------------------------------------------------------------------------
# Set up environment variables
CU_VERSION="cu129"

# Nvidia package versions we override (torch pins older versions).
# Used both as pip constraints during install and for post-install verification.
NVIDIA_CUDNN_VERSION="9.16.0.29"
NVIDIA_NVSHMEM_VERSION="3.4.5"
OPTIONAL_DEPS="${1:-}"

SECONDS=0
Expand Down Expand Up @@ -218,6 +223,7 @@ if [ -n "$OPTIONAL_DEPS" ]; then
EXTRAS="dev,${OPTIONAL_DEPS}"
fi
echo "Installing python extras: [${EXTRAS}]"
source "$(dirname "$0")/cache_nvidia_wheels.sh"
$PIP_CMD install -e "python[${EXTRAS}]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX

mark_step_done "Install main package"
Expand Down Expand Up @@ -331,18 +337,18 @@ fi

# Fix dependencies: DeepEP depends on nvshmem 3.4.5 — skip reinstall when already correct (avoids pip races / wasted work)
INSTALLED_NVSHMEM=$(pip show nvidia-nvshmem-cu12 2>/dev/null | grep "^Version:" | awk '{print $2}' || echo "")
if [ "$INSTALLED_NVSHMEM" = "3.4.5" ]; then
echo "nvidia-nvshmem-cu12==3.4.5 already installed, skipping reinstall"
if [ "$INSTALLED_NVSHMEM" = "$NVIDIA_NVSHMEM_VERSION" ]; then
echo "nvidia-nvshmem-cu12==${NVIDIA_NVSHMEM_VERSION} already installed, skipping reinstall"
else
$PIP_CMD install nvidia-nvshmem-cu12==3.4.5 $PIP_INSTALL_SUFFIX
$PIP_CMD install nvidia-nvshmem-cu12==${NVIDIA_NVSHMEM_VERSION} $PIP_INSTALL_SUFFIX
fi

# Fix dependencies: cuDNN versions older than 9.16.0.29 cause a performance regression in the Conv3D kernel
INSTALLED_CUDNN=$(pip show nvidia-cudnn-cu12 2>/dev/null | grep "^Version:" | awk '{print $2}' || echo "")
if [ "$INSTALLED_CUDNN" = "9.16.0.29" ]; then
echo "nvidia-cudnn-cu12==9.16.0.29 already installed, skipping reinstall"
if [ "$INSTALLED_CUDNN" = "$NVIDIA_CUDNN_VERSION" ]; then
echo "nvidia-cudnn-cu12==${NVIDIA_CUDNN_VERSION} already installed, skipping reinstall"
else
$PIP_CMD install nvidia-cudnn-cu12==9.16.0.29 $PIP_INSTALL_SUFFIX
$PIP_CMD install nvidia-cudnn-cu12==${NVIDIA_CUDNN_VERSION} $PIP_INSTALL_SUFFIX
fi

mark_step_done "Fix other dependencies"
Expand Down
Loading