diff --git a/scripts/ci/cuda/cache_nvidia_wheels.sh b/scripts/ci/cuda/cache_nvidia_wheels.sh
new file mode 100755
index 000000000000..2a0f8dbb9e65
--- /dev/null
+++ b/scripts/ci/cuda/cache_nvidia_wheels.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Cache and pre-install nvidia wheels that torch pins.
+#
+# pypi.nvidia.com returns Cache-Control: no-store, so pip re-downloads
+# cudnn (~707 MB) and nvshmem (~125 MB) on every CI run. This script
+# caches the wheels locally and installs them so that the subsequent
+# `pip install -e "python[dev]"` sees "Requirement already satisfied".
+#
+# Integrity: every wheel is checked with `unzip -t`, both when it is
+# reused from the cache and right after it is downloaded. Downloads go
+# to a temporary file and are renamed into place only once they verify,
+# so an interrupted transfer never sits in the cache under its final name.
+#
+# Best effort: this is only an optimization. Failures print a warning to
+# stderr and never abort the sourcing script, even under `set -e`; pip
+# then fetches whatever is missing during the regular install.
+#
+# Environment (all optional):
+#   NVIDIA_WHEEL_CACHE  cache directory (default: /root/.cache/nvidia-wheels)
+#   PIP_CMD             installer command, word-split (default: pip)
+#   PIP_INSTALL_SUFFIX  extra installer arguments, word-split (default: none)
+#
+# Usage: source scripts/ci/cuda/cache_nvidia_wheels.sh
+
+NVIDIA_WHEEL_CACHE="${NVIDIA_WHEEL_CACHE:-/root/.cache/nvidia-wheels}"
+
+# Download (when not already cached), verify and pre-install the wheels.
+# Globals: NVIDIA_WHEEL_CACHE, PIP_CMD, PIP_INSTALL_SUFFIX (all only read)
+# Returns: always 0, so a failure cannot trip `set -e` in the caller.
+_cache_nvidia_wheels() {
+    local url whl tmp
+    local -a wheels=()
+
+    if ! command -v curl >/dev/null || ! command -v unzip >/dev/null; then
+        echo "WARNING: curl or unzip not found; skipping nvidia wheel cache" >&2
+        return 0
+    fi
+    if ! mkdir -p "$NVIDIA_WHEEL_CACHE"; then
+        echo "WARNING: cannot create $NVIDIA_WHEEL_CACHE; skipping nvidia wheel cache" >&2
+        return 0
+    fi
+
+    for url in \
+        "https://pypi.nvidia.com/nvidia-cudnn-cu12/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl" \
+        "https://pypi.nvidia.com/nvidia-nvshmem-cu12/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl"; do
+        whl="$NVIDIA_WHEEL_CACHE/${url##*/}"
+        if [ -f "$whl" ] && unzip -tq "$whl" &>/dev/null; then
+            wheels+=("$whl")
+            continue
+        fi
+        # Same directory as the target, so the final `mv` is a plain rename.
+        tmp="$whl.part.$$"
+        if curl -fL -o "$tmp" "$url" \
+            && unzip -tq "$tmp" &>/dev/null \
+            && mv -f "$tmp" "$whl"; then
+            wheels+=("$whl")
+        else
+            echo "WARNING: could not cache $url; pip will download it instead" >&2
+            rm -f "$tmp" "$whl" || true
+        fi
+    done
+
+    # Install exactly the wheels verified above rather than a glob: the
+    # cache directory persists across runs and may still hold wheels of
+    # other versions, which would hand pip two versions of one project.
+    if [ "${#wheels[@]}" -gt 0 ]; then
+        # PIP_CMD / PIP_INSTALL_SUFFIX may hold several words (ci_install_dependency.sh
+        # expands them unquoted too), so they must be word-split here.
+        # shellcheck disable=SC2086
+        ${PIP_CMD:-pip} install --no-deps "${wheels[@]}" ${PIP_INSTALL_SUFFIX:-} \
+            || echo "WARNING: pre-installing cached nvidia wheels failed; continuing" >&2
+    fi
+    return 0
+}
+
+_cache_nvidia_wheels
+unset -f _cache_nvidia_wheels
diff --git a/scripts/ci/cuda/ci_install_dependency.sh b/scripts/ci/cuda/ci_install_dependency.sh
index a0bf776ae7b6..d30ae7e26ea9 100755
--- a/scripts/ci/cuda/ci_install_dependency.sh
+++ b/scripts/ci/cuda/ci_install_dependency.sh
@@ -24,6 +24,11 @@ set -euxo pipefail
 # ------------------------------------------------------------------------------
 # Set up environment variables
 CU_VERSION="cu129"
+
+# Nvidia package versions we override (torch pins older versions).
+# Used both as pip constraints during install and for post-install verification.
+NVIDIA_CUDNN_VERSION="9.16.0.29"
+NVIDIA_NVSHMEM_VERSION="3.4.5"
 OPTIONAL_DEPS="${1:-}"
 
 SECONDS=0
@@ -218,6 +223,9 @@ if [ -n "$OPTIONAL_DEPS" ]; then
     EXTRAS="dev,${OPTIONAL_DEPS}"
 fi
 echo "Installing python extras: [${EXTRAS}]"
+# Pre-install the nvidia wheels torch pins from the local wheel cache so the install
+# below finds them already satisfied. Resolved relative to this script's own location.
+source "$(dirname "${BASH_SOURCE[0]}")/cache_nvidia_wheels.sh"
 $PIP_CMD install -e "python[${EXTRAS}]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX
 
 mark_step_done "Install main package"
@@ -331,18 +339,18 @@ fi
 
 # Fix dependencies: DeepEP depends on nvshmem 3.4.5 — skip reinstall when already correct (avoids pip races / wasted work)
 INSTALLED_NVSHMEM=$(pip show nvidia-nvshmem-cu12 2>/dev/null | grep "^Version:" | awk '{print $2}' || echo "")
-if [ "$INSTALLED_NVSHMEM" = "3.4.5" ]; then
-    echo "nvidia-nvshmem-cu12==3.4.5 already installed, skipping reinstall"
+if [ "$INSTALLED_NVSHMEM" = "$NVIDIA_NVSHMEM_VERSION" ]; then
+    echo "nvidia-nvshmem-cu12==${NVIDIA_NVSHMEM_VERSION} already installed, skipping reinstall"
 else
-    $PIP_CMD install nvidia-nvshmem-cu12==3.4.5 $PIP_INSTALL_SUFFIX
+    $PIP_CMD install "nvidia-nvshmem-cu12==${NVIDIA_NVSHMEM_VERSION}" $PIP_INSTALL_SUFFIX
 fi
 
 # Fix dependencies: Cudnn with version less than 9.16.0.29 will cause performance regression on Conv3D kernel
 INSTALLED_CUDNN=$(pip show nvidia-cudnn-cu12 2>/dev/null | grep "^Version:" | awk '{print $2}' || echo "")
-if [ "$INSTALLED_CUDNN" = "9.16.0.29" ]; then
-    echo "nvidia-cudnn-cu12==9.16.0.29 already installed, skipping reinstall"
+if [ "$INSTALLED_CUDNN" = "$NVIDIA_CUDNN_VERSION" ]; then
+    echo "nvidia-cudnn-cu12==${NVIDIA_CUDNN_VERSION} already installed, skipping reinstall"
 else
-    $PIP_CMD install nvidia-cudnn-cu12==9.16.0.29 $PIP_INSTALL_SUFFIX
+    $PIP_CMD install "nvidia-cudnn-cu12==${NVIDIA_CUDNN_VERSION}" $PIP_INSTALL_SUFFIX
 fi
 
 mark_step_done "Fix other dependencies"