Merged
2 changes: 1 addition & 1 deletion docker/common/install_tensorrt.sh
@@ -95,7 +95,7 @@ install_rockylinux_requirements() {
     "cuda-toolkit-config-common-${CUDA_RUNTIME}.noarch" \
     "libcublas-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.${ARCH1}" \
     "libcublas-devel-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.${ARCH1}"; do
-    wget --retry-connrefused --timeout=180 --tries=10 --continue "https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/${ARCH3}/${pkg}.rpm"
+    wget --retry-connrefused --timeout=180 --tries=10 --continue "https://developer.download.nvidia.com/compute/cuda/repos/rhel8/${ARCH3}/${pkg}.rpm"
 done

 # Remove old packages
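The hunk above changes only the download host, from the `.cn` mirror to `developer.download.nvidia.com`. A minimal Python sketch of the URL construction the `wget` loop performs; the helper name `cuda_repo_url` and its defaults are illustrative, not part of the PR:

```python
def cuda_repo_url(pkg: str, arch: str = "x86_64", distro: str = "rhel8") -> str:
    """Build the RPM URL fetched by install_rockylinux_requirements().

    Illustrative helper (not in the repo): the PR's only change here is the
    host, developer.download.nvidia.cn -> developer.download.nvidia.com.
    """
    return (
        "https://developer.download.nvidia.com/compute/cuda/repos/"
        f"{distro}/{arch}/{pkg}.rpm"
    )


# Example: the libcublas package from the loop above (version string assumed).
print(cuda_repo_url("libcublas-13-0-13.0.0.19-1.x86_64"))
```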
4 changes: 2 additions & 2 deletions docs/source/legacy/reference/support-matrix.md
@@ -152,9 +152,9 @@ The following table shows the supported software for TensorRT-LLM.
   * -
     - Software Compatibility
   * - Container
-    - [25.06](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
+    - [25.08](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
   * - TensorRT
-    - [10.11](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
+    - [10.13](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
   * - Precision
     -
     - Blackwell (SM100/SM120) - FP32, FP16, BF16, FP8, FP4, INT8, INT4
4 changes: 2 additions & 2 deletions jenkins/L0_Test.groovy
@@ -2262,7 +2262,7 @@ def launchTestJobs(pipeline, testFilter)
     "B200_PCIe",
     X86_64_TRIPLE,
     false,
-    "dlfw/",
+    "cuda13/",
     DLFW_IMAGE,
     false,
 ],
@@ -2301,7 +2301,7 @@ def launchTestJobs(pipeline, testFilter)
     "GH200",
     AARCH64_TRIPLE,
     false,
-    "dlfw/",
+    "cuda13/",
     DLFW_IMAGE,
     false,
 ],
11 changes: 5 additions & 6 deletions requirements.txt
@@ -4,7 +4,7 @@ accelerate>=1.7.0
 build
 colored
 # cuda-python>=12,<13 # <For CUDA 12.9>
-cuda-python>=12
+cuda-python>=13
 diffusers>=0.27.0
 lark
 mpi4py
@@ -15,23 +15,22 @@ openai
 polygraphy
 psutil
 # nvidia-ml-py>=12,<13 # <For CUDA 12.9>
-nvidia-ml-py>=12
-# Just a wrapper since nvidia-modelopt requires pynvml
-pynvml==12.0.0
+nvidia-ml-py>=13
 pulp
 pandas
 h5py==3.12.1
 StrEnum
 sentencepiece>=0.1.99
-# tensorrt>=10.11.0,<=10.13.0 # <For CUDA 12.9>
+# tensorrt~=10.11.0 # <For CUDA 12.9>
 tensorrt~=10.13.0
 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-08.html#rel-25-08 uses 2.8.0a0.
 # torch>=2.7.1,<=2.8.0a0 # <For CUDA 12.9>
 torch>=2.8.0a0,<=2.8.0
 torchvision
 nvidia-modelopt[torch]~=0.33.0
 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-08.html#rel-25-08 uses 2.27.7
-nvidia-nccl-cu12
+# nvidia-nccl-cu12 # <For CUDA 12.9>
+nvidia-nccl-cu13
 # nvidia-cuda-nvrtc-cu12 # <For CUDA 12.9>
 nvidia-cuda-nvrtc
 transformers==4.56.0
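The pin bumps above move `cuda-python` and `nvidia-ml-py` to major version 13 and drop the `pynvml` shim. A hedged sketch of a pre-flight check for such pins, using only the standard library; `EXPECTED_MAJORS`, `check_pins`, and the injectable `get_version` parameter are illustrative names, not part of the repo:

```python
from importlib.metadata import version, PackageNotFoundError

# Assumed expectations derived from the requirements.txt diff above.
EXPECTED_MAJORS = {
    "cuda-python": 13,
    "nvidia-ml-py": 13,
}


def check_pins(get_version=version):
    """Return a list of human-readable problems; empty means all pins satisfied.

    get_version is injectable so the check can be exercised without the
    CUDA packages actually installed.
    """
    problems = []
    for pkg, wanted_major in EXPECTED_MAJORS.items():
        try:
            installed = get_version(pkg)
        except PackageNotFoundError:
            problems.append(f"{pkg}: not installed")
            continue
        if int(installed.split(".")[0]) < wanted_major:
            problems.append(f"{pkg}: {installed} is older than major {wanted_major}")
    return problems


if __name__ == "__main__":
    for problem in check_pins():
        print(problem)
```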
2 changes: 1 addition & 1 deletion tensorrt_llm/auto_parallel/cluster_info.py
@@ -176,7 +176,7 @@ class ClusterInfo(DictConversion):
             float32=60,
         ),
     ),
-    # from https://images.nvidia.cn/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf
+    # from https://images.nvidia.com/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf
     "A40":
     ClusterInfo(
         intra_node_bw_per_device=_bandwidths["PCIe-4"],