Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion .github/workflows/execute-notebook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,28 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Build sgl-kernel locally for PyTorch 2.10 compatibility
timeout-minutes: 30
run: |
# Build the sgl-kernel wheel in-tree against torch 2.10.0 and give it the
# manylinux2014 platform tag.
# Install CMake 3.26+ required by sgl-kernel
pip install "cmake>=3.26"
cmake --version
cd sgl-kernel
# The requirement must be quoted: an unquoted ">=0.10" is parsed by the shell
# as a redirection of stdout to a file named "=0.10", which silently drops the
# version bound and swallows pip's output.
pip install "scikit-build-core>=0.10" ninja wheel numpy uv
pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cu129
make build
# Rename wheel to match expected pattern (linux_x86_64 -> manylinux2014_x86_64)
for whl in dist/sgl_kernel-*-linux_x86_64.whl; do
  # When nothing matches, the glob stays literal; skip anything that is not a file.
  [ -f "$whl" ] || continue
  # Swap only the platform-tag suffix; the rest of the path is kept as-is.
  newname="${whl%linux_x86_64.whl}manylinux2014_x86_64.whl"
  mv -- "$whl" "$newname"
  echo "Renamed $whl to $newname"
done

- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
CUSTOM_BUILD_SGL_KERNEL=true bash scripts/ci/cuda/ci_install_dependency.sh
pip install -r docs/requirements.txt
apt-get update && apt-get install -y pandoc parallel retry
ln -sf "$(which python3)" /usr/bin/python
Expand Down
41 changes: 38 additions & 3 deletions .github/workflows/pr-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -804,13 +804,15 @@ jobs:
# =============================================== jit-kernel ====================================================

jit-kernel-unit-test:
needs: [check-changes, call-gate]
needs: [check-changes, call-gate, sgl-kernel-build-wheels]
# Skip for scheduled runs and when target_stage is set
if: |
always() &&
github.event_name != 'schedule' &&
inputs.test_parallel_dispatch != true &&
!inputs.target_stage &&
needs.check-changes.outputs.jit_kernel == 'true'
needs.check-changes.outputs.jit_kernel == 'true' &&
(needs.sgl-kernel-build-wheels.result == 'success' || needs.sgl-kernel-build-wheels.result == 'skipped')
runs-on: 1-gpu-runner
timeout-minutes: 240
env:
Expand All @@ -820,10 +822,43 @@ jobs:
with:
ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

- name: Clean up disk
run: |
ls -alh sgl-kernel/dist || true
rm -rf sgl-kernel/dist/* || true

- name: Download artifacts
if: needs.check-changes.outputs.sgl_kernel == 'true'
uses: actions/download-artifact@v4
with:
path: sgl-kernel/dist/
merge-multiple: true
pattern: wheel-python3.10-cuda12.9

- name: Build sgl-kernel locally if wheel not available
if: needs.check-changes.outputs.sgl_kernel != 'true'
timeout-minutes: 30
run: |
# Build the sgl-kernel wheel in-tree against torch 2.10.0 and give it the
# manylinux2014 platform tag.
# Install CMake 3.26+ required by sgl-kernel
pip install "cmake>=3.26"
cmake --version
cd sgl-kernel
# The requirement must be quoted: an unquoted ">=0.10" is parsed by the shell
# as a redirection of stdout to a file named "=0.10", which silently drops the
# version bound and swallows pip's output.
pip install "scikit-build-core>=0.10" ninja wheel numpy uv
pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cu129
make build
# Rename wheel to match expected pattern (linux_x86_64 -> manylinux2014_x86_64)
for whl in dist/sgl_kernel-*-linux_x86_64.whl; do
  # When nothing matches, the glob stays literal; skip anything that is not a file.
  [ -f "$whl" ] || continue
  # Swap only the platform-tag suffix; the rest of the path is kept as-is.
  newname="${whl%linux_x86_64.whl}manylinux2014_x86_64.whl"
  mv -- "$whl" "$newname"
  echo "Renamed $whl to $newname"
done

- name: Install dependencies
timeout-minutes: 20
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
CUSTOM_BUILD_SGL_KERNEL=true bash scripts/ci/cuda/ci_install_dependency.sh

- name: Run test
timeout-minutes: 30
Expand Down
6 changes: 3 additions & 3 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,10 @@ dependencies = [
"tiktoken",
"timm==1.0.16",
"torch_memory_saver==0.0.9",
"torch==2.9.1",
"torch==2.10.0",
"torchao==0.9.0",
"torchaudio==2.9.1",
"torchcodec==0.9.1 ; sys_platform != 'linux' or (sys_platform == 'linux' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')", # torchcodec 0.9.1 for torch 2.9.x. Not available on Linux ARM.
"torchaudio==2.10.0",
"torchcodec==0.9.1 ; sys_platform != 'linux' or (sys_platform == 'linux' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')",
"av ; sys_platform == 'linux' and (platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'armv7l')",
"torchvision",
"tqdm",
Expand Down
92 changes: 53 additions & 39 deletions python/sglang/srt/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# NOTE: With PyTorch 2.10.0, multimodal models still need cuDNN >= 9.15 for
# nn.Conv3d. check_torch_cudnn_compatibility() enforces this at startup unless
# SGLANG_DISABLE_CUDNN_CHECK is set, and CI installs
# nvidia-cudnn-cu12==9.16.0.29 to avoid the performance regression.
"""The arguments of the server."""

from __future__ import annotations
Expand Down Expand Up @@ -5718,14 +5721,12 @@ def check_server_args(self):
f"Invalid value: '{self.served_model_name}'"
)

# Check cuDNN compatibility for PyTorch 2.10+
self.check_torch_cudnn_compatibility()

# Check LoRA
self.check_lora_server_args()

# torch 2.9.1 has compatibility issues with cuDNN 9.14 and below,
# causing extremely slow nn.Conv3d performance.
# TODO(yhyang201): Remove this check when sglang no longer uses torch 2.9.1.
self.check_torch_2_9_1_cudnn_compatibility()

# Check speculative decoding
if self.speculative_algorithm is not None:
assert (
Expand Down Expand Up @@ -5834,48 +5835,61 @@ def check_server_args(self):
"When enabling two batch overlap, moe_a2a_backend cannot be 'none'."
)

def check_torch_cudnn_compatibility(self):
    """Refuse to start a multimodal model on PyTorch 2.10.x with cuDNN < 9.15.

    PyTorch 2.10.0 ships with cuDNN 9.10.2.21 which has a Conv3D performance
    regression, so multimodal models need cuDNN 9.15 or newer.

    The check is a no-op when any of the following holds:
      * the ``SGLANG_DISABLE_CUDNN_CHECK`` environment variable is enabled,
      * the model is not multimodal (or ``get_model_config`` is unavailable),
      * the installed torch is not a 2.10.x release.

    If the cuDNN version cannot be determined, a warning is logged instead
    of failing.

    Raises:
        RuntimeError: If the detected cuDNN version is older than 9.15.
    """
    if get_bool_env_var("SGLANG_DISABLE_CUDNN_CHECK"):
        return

    # Only check for multimodal models which use Conv3D
    if (
        not hasattr(self, "get_model_config")
        or not self.get_model_config().is_multimodal
    ):
        return

    import torch

    # Drop a local build suffix such as "+cu129" before inspecting the version.
    torch_version = torch.__version__.split("+", 1)[0]
    # Compare the major/minor components exactly so that only 2.10.x matches
    # (a plain startswith("2.10") would also accept e.g. "2.100.0").
    if torch_version.split(".")[:2] != ["2", "10"]:
        return

    try:
        cudnn_version = torch.backends.cudnn.version()
    except Exception:
        # Best effort: an unreadable version is reported as a warning below.
        cudnn_version = None

    RED = "\033[91m"
    BOLD = "\033[1m"
    RESET = "\033[0m"

    if cudnn_version is None:
        logger.warning(
            f"{RED}WARNING: Could not determine CuDNN version for torch==2.10.0. "
            f"Please ensure CuDNN >= 9.15 to avoid nn.Conv3d bugs.{RESET}"
        )
        return

    # The first three digits are read as <major><2-digit minor>, e.g.
    # 91002 -> 9.10 (assumes the integer encoding used by cuDNN 9.x - TODO confirm).
    version_float = float(str(cudnn_version)[:3]) / 100
    if version_float >= 9.15:
        return

    msg = (
        f"{RED}{BOLD}"
        "CRITICAL WARNING: PyTorch 2.10.0 & CuDNN Compatibility Issue Detected\n"
        "--------------------------------------------------------------------------------\n"
        f"Current Environment: PyTorch {torch.__version__} | CuDNN {version_float:.2f}\n\n"
        "Issue: PyTorch 2.10.0 ships with cuDNN 9.10.2.21 which has a KNOWN BUG\n"
        " in `nn.Conv3d` implementation causing SEVERE PERFORMANCE DEGRADATION\n"
        " and EXCESSIVE MEMORY USAGE in multimodal models.\n\n"
        "Solution: You MUST upgrade CuDNN to version 9.15+ to ensure correctness.\n\n"
        "Run the following command immediately to fix:\n"
        " pip install nvidia-cudnn-cu12==9.16.0.29\n\n"
        "Or you can disable this check by setting env var SGLANG_DISABLE_CUDNN_CHECK=1\n"
        "--------------------------------------------------------------------------------\n"
        f"{RESET}"
    )
    raise RuntimeError(msg)

def check_lora_server_args(self):
assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
Expand Down
10 changes: 10 additions & 0 deletions scripts/ci/cuda/ci_install_dependency.sh
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ if [ -n "$OPTIONAL_DEPS" ]; then
fi
echo "Installing python extras: [${EXTRAS}]"

$PIP_CMD uninstall torch || true
$PIP_CMD install -e "python[${EXTRAS}]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX

# Fix CUDA version mismatch between torch and torchaudio.
Expand Down Expand Up @@ -261,6 +262,15 @@ if [ "$IS_BLACKWELL" = "1" ]; then
else
$PIP_CMD install nvidia-cudnn-cu12==9.16.0.29 --force-reinstall $PIP_INSTALL_SUFFIX
fi

# Set LD_LIBRARY_PATH to use pip-installed cuDNN instead of PyTorch's bundled cuDNN
# This is critical for PyTorch 2.10+ which ships with cuDNN 9.10.2.21 that has Conv3D performance issues
# NOTE: the export only affects this script and the processes it starts.

#######################################
# Put a directory first on LD_LIBRARY_PATH.
# Globals:   LD_LIBRARY_PATH (read, written, exported)
# Arguments: $1 - directory to search before the existing entries
#######################################
prepend_ld_library_path() {
  # Only add the ":" separator when there is an existing value; a trailing
  # empty entry would make the dynamic loader search the current directory.
  export LD_LIBRARY_PATH="$1${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
}

# __path__[0] is the nvidia.cudnn package directory itself; the wheel is
# expected to keep its shared libraries in lib/ below it (guarded by -d).
CUDNN_PATH=$(python3 -c "import nvidia.cudnn; print(list(nvidia.cudnn.__path__)[0])" 2>/dev/null || echo "")
if [ -n "$CUDNN_PATH" ] && [ -d "$CUDNN_PATH/lib" ]; then
  prepend_ld_library_path "$CUDNN_PATH/lib"
  echo "Set LD_LIBRARY_PATH to use pip-installed cuDNN: $CUDNN_PATH/lib"
fi

$PIP_CMD uninstall xformers || true

# Install flashinfer-jit-cache with caching and retry logic (flashinfer.ai can have transient DNS issues)
Expand Down
8 changes: 4 additions & 4 deletions sgl-kernel/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,10 @@ RUN set -eux; \
RUN --mount=type=cache,id=sgl-kernel-pip,target=/root/.cache/pip \
set -eux; \
case "${CUDA_VERSION}" in \
13.0) TORCH_VER=2.9.1; CU_TAG=cu130 ;; \
12.9) TORCH_VER=2.9.1; CU_TAG=cu128 ;; \
12.8) TORCH_VER=2.9.1; CU_TAG=cu128 ;; \
*) TORCH_VER=2.9.1; CU_TAG=cu126 ;; \
13.0) TORCH_VER=2.10.0; CU_TAG=cu130 ;; \
12.9) TORCH_VER=2.10.0; CU_TAG=cu128 ;; \
12.8) TORCH_VER=2.10.0; CU_TAG=cu128 ;; \
*) TORCH_VER=2.10.0; CU_TAG=cu126 ;; \
esac; \
${PYTHON_ROOT_PATH}/bin/pip install torch==${TORCH_VER} --index-url https://${PYTORCH_MIRROR}/whl/${CU_TAG}; \
${PYTHON_ROOT_PATH}/bin/pip install ninja setuptools==75.0.0 wheel==0.41.0 numpy uv scikit-build-core --index-url ${PIP_DEFAULT_INDEX}
Expand Down
2 changes: 1 addition & 1 deletion sgl-kernel/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
sgl-kernel provides optimized compute primitives for LLM inference engines, enabling efficient inference for large language models and vision-language models through custom kernel operations. It has been used by [LightLLM](https://github.com/ModelTC/LightLLM), [SGLang](https://github.com/sgl-project/sglang) and so on.

## Installation
Requires torch == 2.9.1
Requires torch == 2.10.0

```bash
# Latest version
Expand Down
Loading