Upgrade the pinned PyTorch version from 2.9.1 to 2.10.0.

* .github/workflows: build (or download) an sgl-kernel wheel before installing
  dependencies, and run ci_install_dependency.sh with CUSTOM_BUILD_SGL_KERNEL=true.
* python/pyproject.toml, sgl-kernel/Dockerfile, sgl-kernel/README.md: pin torch 2.10.0.
* python/sglang/srt/server_args.py: rename the runtime cuDNN check to
  check_torch_cudnn_compatibility() and target torch 2.10.x.
* scripts/ci/cuda/ci_install_dependency.sh: uninstall torch before installing the
  extras and prepend the pip-installed cuDNN lib directory to LD_LIBRARY_PATH.

diff --git a/.github/workflows/execute-notebook.yml b/.github/workflows/execute-notebook.yml
index b24fe8c81b7f..250cb9c5b43e 100644
--- a/.github/workflows/execute-notebook.yml
+++ b/.github/workflows/execute-notebook.yml
@@ -34,9 +34,28 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4
 
+      - name: Build sgl-kernel locally for PyTorch 2.10 compatibility
+        timeout-minutes: 30
+        run: |
+          # Install CMake 3.26+ required by sgl-kernel
+          pip install "cmake>=3.26"
+          cmake --version
+          cd sgl-kernel
+          pip install "scikit-build-core>=0.10" ninja wheel numpy uv
+          pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cu129
+          make build
+          # Rename wheel to match expected pattern (linux_x86_64 -> manylinux2014_x86_64)
+          for whl in dist/sgl_kernel-*-linux_x86_64.whl; do
+            if [ -f "$whl" ]; then
+              newname=$(echo "$whl" | sed 's/linux_x86_64/manylinux2014_x86_64/')
+              mv "$whl" "$newname"
+              echo "Renamed $whl to $newname"
+            fi
+          done
+
       - name: Install dependencies
         run: |
-          bash scripts/ci/cuda/ci_install_dependency.sh
+          CUSTOM_BUILD_SGL_KERNEL=true bash scripts/ci/cuda/ci_install_dependency.sh
           pip install -r docs/requirements.txt
           apt-get update && apt-get install -y pandoc parallel retry
           ln -sf "$(which python3)" /usr/bin/python
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index d77c0dc6087b..a9dfa538538b 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -804,13 +804,15 @@ jobs:
   # =============================================== jit-kernel ====================================================
 
   jit-kernel-unit-test:
-    needs: [check-changes, call-gate]
+    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
     # Skip for scheduled runs and when target_stage is set
     if: |
+      always() &&
       github.event_name != 'schedule' &&
       inputs.test_parallel_dispatch != true &&
       !inputs.target_stage &&
-      needs.check-changes.outputs.jit_kernel == 'true'
+      needs.check-changes.outputs.jit_kernel == 'true' &&
+      (needs.sgl-kernel-build-wheels.result == 'success' || needs.sgl-kernel-build-wheels.result == 'skipped')
     runs-on: 1-gpu-runner
     timeout-minutes: 240
     env:
@@ -820,10 +822,43 @@ jobs:
         with:
           ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
 
+      - name: Clean up disk
+        run: |
+          ls -alh sgl-kernel/dist || true
+          rm -rf sgl-kernel/dist/* || true
+
+      - name: Download artifacts
+        if: needs.check-changes.outputs.sgl_kernel == 'true'
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true
+          pattern: wheel-python3.10-cuda12.9
+
+      - name: Build sgl-kernel locally if wheel not available
+        if: needs.check-changes.outputs.sgl_kernel != 'true'
+        timeout-minutes: 30
+        run: |
+          # Install CMake 3.26+ required by sgl-kernel
+          pip install "cmake>=3.26"
+          cmake --version
+          cd sgl-kernel
+          pip install "scikit-build-core>=0.10" ninja wheel numpy uv
+          pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cu129
+          make build
+          # Rename wheel to match expected pattern (linux_x86_64 -> manylinux2014_x86_64)
+          for whl in dist/sgl_kernel-*-linux_x86_64.whl; do
+            if [ -f "$whl" ]; then
+              newname=$(echo "$whl" | sed 's/linux_x86_64/manylinux2014_x86_64/')
+              mv "$whl" "$newname"
+              echo "Renamed $whl to $newname"
+            fi
+          done
+
       - name: Install dependencies
         timeout-minutes: 20
         run: |
-          bash scripts/ci/cuda/ci_install_dependency.sh
+          CUSTOM_BUILD_SGL_KERNEL=true bash scripts/ci/cuda/ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 30
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 9b2613ddb793..909aab0d944d 100755
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -65,10 +65,10 @@ dependencies = [
   "tiktoken",
   "timm==1.0.16",
   "torch_memory_saver==0.0.9",
-  "torch==2.9.1",
+  "torch==2.10.0",
   "torchao==0.9.0",
-  "torchaudio==2.9.1",
-  "torchcodec==0.9.1 ; sys_platform != 'linux' or (sys_platform == 'linux' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')", # torchcodec 0.9.1 for torch 2.9.x. Not available on Linux ARM.
+  "torchaudio==2.10.0",
+  "torchcodec==0.9.1 ; sys_platform != 'linux' or (sys_platform == 'linux' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')", # Not available on Linux ARM. NOTE(review): pin was chosen for torch 2.9.x - confirm it works with torch 2.10.
   "av ; sys_platform == 'linux' and (platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'armv7l')",
   "torchvision",
   "tqdm",
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index e1177cfcd11b..c7d3b0cc7c2f 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -11,6 +11,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# NOTE: PyTorch 2.10.0 still requires cuDNN >= 9.15 for Conv3D operations.
+# ServerArgs.check_torch_cudnn_compatibility() enforces this at runtime for
+# multimodal models; CI additionally installs nvidia-cudnn-cu12==9.16.0.29.
 """The arguments of the server."""
 
 from __future__ import annotations
@@ -5718,14 +5721,12 @@ def check_server_args(self):
                 f"Invalid value: '{self.served_model_name}'"
             )
 
+        # Check cuDNN compatibility for PyTorch 2.10+
+        self.check_torch_cudnn_compatibility()
+
         # Check LoRA
         self.check_lora_server_args()
 
-        # torch 2.9.1 has compatibility issues with cuDNN 9.14 and below,
-        # causing extremely slow nn.Conv3d performance.
-        # TODO(yhyang201): Remove this check when sglang no longer uses torch 2.9.1.
-        self.check_torch_2_9_1_cudnn_compatibility()
-
         # Check speculative decoding
         if self.speculative_algorithm is not None:
             assert (
@@ -5834,48 +5835,63 @@ def check_server_args(self):
                 "When enabling two batch overlap, moe_a2a_backend cannot be 'none'."
             )
 
-    def check_torch_2_9_1_cudnn_compatibility(self):
+    def check_torch_cudnn_compatibility(self):
+        """Check cuDNN compatibility for PyTorch 2.10.x.
+
+        PyTorch 2.10.0 ships with cuDNN 9.10.2.21 which has Conv3D performance regression.
+        For multimodal models this raises RuntimeError when the detected cuDNN is older
+        than 9.15, and logs a warning when the cuDNN version cannot be determined.
+        The check is skipped when SGLANG_DISABLE_CUDNN_CHECK=1 is set.
+        """
         if get_bool_env_var("SGLANG_DISABLE_CUDNN_CHECK"):
             return
 
-        if self.get_model_config().is_multimodal:
-            import torch
+        # Only check for multimodal models which use Conv3D
+        if (
+            not hasattr(self, "get_model_config")
+            or not self.get_model_config().is_multimodal
+        ):
+            return
+
+        import torch
 
-            if torch_release[:3] == (2, 9, 1):
+        torch_version = torch.__version__.split("+", 1)[0]
+        # Only versions starting with "2.10" are matched (the release with the cuDNN issue)
+        if torch_version.startswith("2.10"):
+            cudnn_version = None
+            try:
+                cudnn_version = torch.backends.cudnn.version()
+            except Exception:
                 cudnn_version = None
-                try:
-                    cudnn_version = torch.backends.cudnn.version()
-                except Exception:
-                    cudnn_version = None
-                if cudnn_version is not None:
-                    version_float = float(str(cudnn_version)[:3]) / 100
-                    if version_float < 9.15:
-                        RED = "\033[91m"
-                        BOLD = "\033[1m"
-                        RESET = "\033[0m"
-                        msg = (
-                            f"{RED}{BOLD}"
-                            "CRITICAL WARNING: PyTorch 2.9.1 & CuDNN Compatibility Issue Detected\n"
-                            "--------------------------------------------------------------------------------\n"
-                            f"Current Environment: PyTorch {torch.__version__} | CuDNN {version_float:.2f}\n\n"
-                            "Issue: There is a KNOWN BUG in PyTorch 2.9.1's `nn.Conv3d` implementation\n"
-                            " when used with CuDNN versions older than 9.15. This can cause\n"
-                            " SEVERE PERFORMANCE DEGRADATION and EXCESSIVE MEMORY USAGE.\n\n"
-                            "Reference: https://github.com/pytorch/pytorch/issues/168167\n\n"
-                            "Solution: You MUST upgrade CuDNN to version 9.15+ to ensure correctness.\n\n"
-                            "Run the following command immediately to fix:\n"
-                            " pip install nvidia-cudnn-cu12==9.16.0.29\n\n"
-                            "Or you can disable this check by setting env var SGLANG_DISABLE_CUDNN_CHECK=1\n"
-                            "--------------------------------------------------------------------------------\n"
-                            f"{RESET}"
-                        )
-                        raise RuntimeError(msg)
-                else:
+            if cudnn_version is not None:
+                version_float = float(str(cudnn_version)[:3]) / 100
+                if version_float < 9.15:
                     RED = "\033[91m"
+                    BOLD = "\033[1m"
                     RESET = "\033[0m"
-                    logger.warning(
-                        f"{RED}WARNING: Could not determine CuDNN version for torch==2.9.1. Please ensure CuDNN >= 9.15 to avoid nn.Conv3d bugs.{RESET}"
+                    msg = (
+                        f"{RED}{BOLD}"
+                        "CRITICAL WARNING: PyTorch 2.10.0 & CuDNN Compatibility Issue Detected\n"
+                        "--------------------------------------------------------------------------------\n"
+                        f"Current Environment: PyTorch {torch.__version__} | CuDNN {version_float:.2f}\n\n"
+                        "Issue: PyTorch 2.10.0 ships with cuDNN 9.10.2.21 which has a KNOWN BUG\n"
+                        " in `nn.Conv3d` implementation causing SEVERE PERFORMANCE DEGRADATION\n"
+                        " and EXCESSIVE MEMORY USAGE in multimodal models.\n\n"
+                        "Solution: You MUST upgrade CuDNN to version 9.15+ to ensure correctness.\n\n"
+                        "Run the following command immediately to fix:\n"
+                        " pip install nvidia-cudnn-cu12==9.16.0.29\n\n"
+                        "Or you can disable this check by setting env var SGLANG_DISABLE_CUDNN_CHECK=1\n"
+                        "--------------------------------------------------------------------------------\n"
+                        f"{RESET}"
                     )
+                    raise RuntimeError(msg)
+            else:
+                RED = "\033[91m"
+                RESET = "\033[0m"
+                logger.warning(
+                    f"{RED}WARNING: Could not determine CuDNN version for torch==2.10.0. "
+                    f"Please ensure CuDNN >= 9.15 to avoid nn.Conv3d bugs.{RESET}"
+                )
 
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
diff --git a/scripts/ci/cuda/ci_install_dependency.sh b/scripts/ci/cuda/ci_install_dependency.sh
index 381a05dc9861..54039c6368fa 100755
--- a/scripts/ci/cuda/ci_install_dependency.sh
+++ b/scripts/ci/cuda/ci_install_dependency.sh
@@ -156,6 +156,7 @@ if [ -n "$OPTIONAL_DEPS" ]; then
 fi
 echo "Installing python extras: [${EXTRAS}]"
 
+$PIP_CMD uninstall torch || true
 $PIP_CMD install -e "python[${EXTRAS}]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX
 
 # Fix CUDA version mismatch between torch and torchaudio.
@@ -261,6 +262,16 @@ if [ "$IS_BLACKWELL" = "1" ]; then
 else
     $PIP_CMD install nvidia-cudnn-cu12==9.16.0.29 --force-reinstall $PIP_INSTALL_SUFFIX
 fi
+
+# Set LD_LIBRARY_PATH to use pip-installed cuDNN instead of PyTorch's bundled cuDNN
+# This is critical for PyTorch 2.10+ which ships with cuDNN 9.10.2.21 that has Conv3D performance issues
+# NOTE(review): the export only reaches the rest of this script; workflow steps that run after it do not inherit it.
+CUDNN_PATH=$(python3 -c "import nvidia.cudnn; print(list(nvidia.cudnn.__path__)[0])" 2>/dev/null || echo "")
+if [ -n "$CUDNN_PATH" ] && [ -d "$CUDNN_PATH/lib" ]; then
+    export LD_LIBRARY_PATH="$CUDNN_PATH/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+    echo "Set LD_LIBRARY_PATH to use pip-installed cuDNN: $CUDNN_PATH/lib"
+fi
+
 $PIP_CMD uninstall xformers || true
 
 # Install flashinfer-jit-cache with caching and retry logic (flashinfer.ai can have transient DNS issues)
diff --git a/sgl-kernel/Dockerfile b/sgl-kernel/Dockerfile
index 50cefe427af5..c27ffae5ba40 100644
--- a/sgl-kernel/Dockerfile
+++ b/sgl-kernel/Dockerfile
@@ -79,10 +79,10 @@ RUN set -eux; \
 RUN --mount=type=cache,id=sgl-kernel-pip,target=/root/.cache/pip \
     set -eux; \
     case "${CUDA_VERSION}" in \
-      13.0) TORCH_VER=2.9.1; CU_TAG=cu130 ;; \
-      12.9) TORCH_VER=2.9.1; CU_TAG=cu128 ;; \
-      12.8) TORCH_VER=2.9.1; CU_TAG=cu128 ;; \
-      *) TORCH_VER=2.9.1; CU_TAG=cu126 ;; \
+      13.0) TORCH_VER=2.10.0; CU_TAG=cu130 ;; \
+      12.9) TORCH_VER=2.10.0; CU_TAG=cu128 ;; \
+      12.8) TORCH_VER=2.10.0; CU_TAG=cu128 ;; \
+      *) TORCH_VER=2.10.0; CU_TAG=cu126 ;; \
     esac; \
     ${PYTHON_ROOT_PATH}/bin/pip install torch==${TORCH_VER} --index-url https://${PYTORCH_MIRROR}/whl/${CU_TAG}; \
     ${PYTHON_ROOT_PATH}/bin/pip install ninja setuptools==75.0.0 wheel==0.41.0 numpy uv scikit-build-core --index-url ${PIP_DEFAULT_INDEX}
diff --git a/sgl-kernel/README.md b/sgl-kernel/README.md
index 877f220f06d2..19272545ce00 100644
--- a/sgl-kernel/README.md
+++ b/sgl-kernel/README.md
@@ -12,7 +12,7 @@
 sgl-kernel provides optimized compute primitives for LLM inference engines, enabling efficient inference for large language models and vision-language models through custom kernel operations. It has been used by [LightLLM](https://github.com/ModelTC/LightLLM), [SGLang](https://github.com/sgl-project/sglang) and so on.
 
 ## Installation
-Requires torch == 2.9.1
+Requires torch == 2.10.0
 
 ```bash
 # Latest version