diff --git a/.claude/skills/add-jit-kernel/SKILL.md b/.claude/skills/add-jit-kernel/SKILL.md new file mode 100644 index 000000000000..232c9eb99be0 --- /dev/null +++ b/.claude/skills/add-jit-kernel/SKILL.md @@ -0,0 +1,561 @@ +--- +name: add-jit-kernel +description: Step-by-step tutorial for adding a new lightweight JIT CUDA kernel to sglang's jit_kernel module +--- + +# Tutorial: Adding a New JIT Kernel to SGLang + +This tutorial walks through adding a simple element-wise scale operation as a JIT kernel. We'll implement `scale(x, factor) = x * factor` to demonstrate the complete workflow. + +## Goal + +Add a new operation that scales each element of a tensor by a scalar factor: + +- Input: tensor `x` (CUDA) and scalar `factor` (float, passed as C++ template argument) +- Output: `x * factor` (element-wise), allocated internally +- Supported dtypes: **FP16 (`torch.float16`), BF16 (`torch.bfloat16`), FP32 (`torch.float32`)** + +## When to use JIT vs AOT (`sgl-kernel`) + +- **JIT (`jit_kernel`)**: prefer this first for kernels that do **not** depend on CUTLASS or another large C++ project. It is the default choice for lightweight kernels that benefit from rapid iteration and first-use compilation. +- **AOT (`sgl-kernel`)**: prefer this when the kernel **does** depend on CUTLASS or another large C++ project, or when it should live in `sgl-kernel/` and participate in the wheel build / torch op registration flow. +- **Exception**: kernels that depend on `flashinfer`, or on CUTLASS that is already provided through `flashinfer`, can still be implemented as `jit_kernel`. + +--- + +## Common Abstractions in `python/sglang/jit_kernel/include/sgl_kernel/` + +**Always prefer these abstractions over raw CUDA primitives.** They provide safety, readability, and consistency with the rest of the codebase. 
+ +**Important include rule:** for every `#include <...>` line, add a short trailing comment explaining why that header is included (for example `// For TensorMatcher, SymbolicSize, SymbolicDevice`). This matches the current JIT kernel style and keeps include usage self-documenting. + +### `utils.h` — Host-side utilities + +```cpp +#include <sgl_kernel/utils.h> +``` + +- **`host::RuntimeCheck(cond, args...)`** — Assert a condition at runtime; throws `PanicError` with file/line info on failure. Prefer this over bare `assert`. +- **`host::Panic(args...)`** — Unconditionally throw a `PanicError` with a descriptive message. +- **`host::div_ceil(a, b)`** — Integer ceiling division `(a + b - 1) / b`. +- **`host::irange(n)`** / **`host::irange(start, end)`** — Range views for cleaner loops. +- **`host::pointer::offset(ptr, offsets...)`** — Byte-safe pointer arithmetic on `void*`. Use this instead of raw casts. + +### `utils.cuh` — Device-side utilities + `LaunchKernel` + +```cpp +#include <sgl_kernel/utils.cuh> +``` + +- **Type aliases**: `fp16_t`, `bf16_t`, `fp32_t`, `fp8_e4m3_t`, `fp8_e5m2_t` and their packed variants `fp16x2_t`, `bf16x2_t`, `fp32x2_t`, etc. +- **`SGL_DEVICE`** — Expands to `__forceinline__ __device__`. Use on all device functions. +- **`device::kWarpThreads`** — Constant `32`. +- **`device::load_as<T>(ptr, offset)`** / **`device::store_as<T>(ptr, val, offset)`** — Type-safe loads/stores from `void*`. +- **`device::pointer::offset(ptr, offsets...)`** — Pointer arithmetic on device. +- **`host::LaunchKernel(grid, block, device_or_stream [, smem])`** — RAII kernel launcher that: + - Resolves the CUDA stream from a `DLDevice` via TVM-FFI automatically. + - Checks the CUDA error with file/line info after launch via `operator()(kernel, args...)`. + - Supports `.enable_pdl(bool)` for PDL (Programmatic Dependent Launch, SM90+). +- **`host::RuntimeDeviceCheck(cudaError_t)`** — Check a CUDA error; throw on failure. 
+ +### `tensor.h` — Tensor validation (`TensorMatcher`, Symbolic types) + +```cpp +#include +``` + +This is the **primary validation API** for all kernel launchers. Use it to validate every `tvm::ffi::TensorView` argument. + +- **`host::SymbolicSize{"name"}`** — A named symbolic dimension. Call `.set_value(n)` to pin it, `.unwrap()` to extract after verification. +- **`host::SymbolicDType`** — Symbolic dtype. Use `.set_options()` to restrict allowed types. +- **`host::SymbolicDevice`** — Symbolic device. Use `.set_options()` to restrict to CUDA. +- **`host::TensorMatcher({dims...})`** — Fluent builder for tensor validation: + - `.with_dtype()` — require a specific C++ type (e.g. `fp16_t`) + - `.with_dtype()` — allow a set of types + - `.with_device(device_sym)` — require CUDA, bind device to symbol + - `.with_strides({strides...})` — validate strides (omit to require contiguous) + - `.verify(tensor_view)` — execute the check; throws `PanicError` with full context on failure; **chainable** (`verify(a).verify(b)` to check multiple tensors with the same shape) + +**Typical pattern:** +```cpp +auto N = SymbolicSize{"num_elements"}; +auto device = SymbolicDevice{}; +device.set_options(); +TensorMatcher({N}) // + .with_dtype() + .with_device(device) + .verify(dst) + .verify(src); // same shape, dtype, device as dst +const size_t n = N.unwrap(); +const DLDevice dev = device.unwrap(); +``` + +### `type.cuh` — `dtype_trait` and `packed_t` + +```cpp +#include +``` + +- **`dtype_trait`** — Static trait struct for each scalar type. Provides: + - `dtype_trait::from(value)` — convert from another type (e.g. `fp32_t` → `fp16_t`) + - `dtype_trait::abs/sqrt/rsqrt/exp/sin/cos(x)` — type-dispatched unary math (primarily for `fp32_t`) + - `dtype_trait::max/min(x, y)` — type-dispatched binary math (primarily for `fp32_t`) +- **`packed_t`** — Two-element packed alias: `packed_t` = `fp16x2_t`, `packed_t` = `bf16x2_t`, `packed_t` = `fp32x2_t`. Use for vectorized loads/stores. 
+- **`device::cast(value)`** — Type-safe cast using `dtype_trait`, e.g. `cast(v)`. + +### `vec.cuh` — Vectorized memory access (`AlignedVector`) + +```cpp +#include +``` + +- **`device::AlignedVector`** — Aligned storage for N elements of type T. N must be a power of two, `sizeof(T)*N <= 32`. Enables vectorized loads/stores for bandwidth efficiency. In terms of API/codegen constraints, the upper bound is 256-bit; in practice, 128-bit is the portable default, while 256-bit vectorization is typically only viable on `SM100+` and should be gated by an architecture check when needed. + - `.load(ptr, offset)` — vectorized load from `ptr[offset]` + - `.store(ptr, offset)` — vectorized store to `ptr[offset]` + - `.fill(value)` — fill all lanes + - `operator[](i)` — element access + +### `tile.cuh` — `tile::Memory` (strided memory access pattern) + +```cpp +#include +``` + +- `tile::Memory` is fundamentally a **1D cooperative accessor** over a contiguous region. +- **`device::tile::Memory::cta(blockDim.x)`** — Creates a tile accessor where each thread handles `tid = threadIdx.x` with stride `tsize` (for `cta(blockDim.x)`, this is `blockDim.x`). Common for loops over a 1D array. +- **`.load(ptr, offset)`** — loads `ptr[tid + offset * tsize]` +- **`.store(ptr, val, offset)`** — stores to `ptr[tid + offset * tsize]` +- **`.in_bound(n, offset)`** — boundary check + +For a **2D tile**, either flatten `(row, col)` into a linear tile index first, or compute the address manually with `ptr[row * stride + col]` using your thread/block coordinates. 
+ +### `math.cuh` — Device math (`device::math::`) + +```cpp +#include +``` + +- `device::math::max/min(a, b)` — type-dispatched binary math via `dtype_trait` +- `device::math::abs/sqrt/rsqrt/exp/sin/cos(x)` — type-dispatched unary math via `dtype_trait` + +### `warp.cuh` — Warp-level primitives + +```cpp +#include +``` + +- `device::warp::reduce_sum(value)` — warp-level sum reduction via `__shfl_xor_sync` +- `device::warp::reduce_max(value)` — warp-level max reduction + +### `cta.cuh` — CTA-level primitives + +```cpp +#include +``` + +- `device::cta::reduce_max(value, smem, min_value)` — CTA-wide max using shared memory + warp reduction. Caller is responsible for a `__syncthreads()` after if the result in `smem[0]` is needed. + +### `atomic.cuh` — Atomic operations + +```cpp +#include +``` + +- `device::atomic::max(float* addr, float value)` — float atomic max (handles negative values correctly via bit tricks). + +### `runtime.cuh` — Occupancy and device info + +```cpp +#include +``` + +- `host::runtime::get_blocks_per_sm(kernel, block_dim)` — max active blocks per SM (occupancy) +- `host::runtime::get_sm_count(device_id)` — number of SMs on the device +- `host::runtime::get_cc_major(device_id)` — compute capability major version + +**Persistent kernel pattern** (cap blocks to SM count × occupancy): +```cpp +static const uint32_t max_occ = runtime::get_blocks_per_sm(kernel, kBlockSize); +static const uint32_t num_sm = runtime::get_sm_count(device.unwrap().device_id); +const auto num_blocks = std::min(num_sm * max_occ, div_ceil(n, kBlockSize)); +LaunchKernel(num_blocks, kBlockSize, device.unwrap())(kernel, params); +``` + +--- + +## Step 0 (optional): Generate a `.clangd` config for better IDE support + +```bash +python -m sglang.jit_kernel +``` + +--- + +## Step 1: Implement the CUDA kernel in `jit_kernel/csrc/` + +Create `python/sglang/jit_kernel/csrc/elementwise/scale.cuh`. 
+ +The implementation fully uses the project abstractions described above: + +```cpp +#include <sgl_kernel/tensor.h> // For TensorMatcher, SymbolicSize, SymbolicDevice +#include <sgl_kernel/type.cuh> // For dtype_trait, fp16_t, bf16_t, fp32_t +#include <sgl_kernel/utils.h> // For RuntimeCheck, div_ceil +#include <sgl_kernel/utils.cuh> // For LaunchKernel, SGL_DEVICE +#include <sgl_kernel/vec.cuh> // For AlignedVector + +#include <algorithm> +#include <cstdint> + +namespace { + +// ---------------------------------------------------------------- +// Kernel: element-wise scale using vectorized 128-bit loads/stores +// T = fp16_t | bf16_t | fp32_t +// kVecN = number of elements per vector load (e.g. 8 for fp16) +// kFactor = scale factor encoded as kFactorNumer / kFactorDenom +// ---------------------------------------------------------------- +template <typename T, int kVecN, int64_t kFactorNumer, int64_t kFactorDenom> +__global__ void scale_kernel(T* __restrict__ dst, + const T* __restrict__ src, + uint32_t n_vecs, + uint32_t n_remainder, + uint32_t n_total) { + constexpr float kFactor = static_cast<float>(kFactorNumer) + / static_cast<float>(kFactorDenom); + + using vec_t = device::AlignedVector<T, kVecN>; + + // --- vectorised body --- + const uint32_t vec_stride = blockDim.x * gridDim.x; + for (uint32_t vi = blockIdx.x * blockDim.x + threadIdx.x; + vi < n_vecs; + vi += vec_stride) { + vec_t v; + v.load(src, vi); +#pragma unroll + for (int i = 0; i < kVecN; ++i) { + v[i] = static_cast<T>(static_cast<float>(v[i]) * kFactor); + } + v.store(dst, vi); + } + + // --- scalar tail --- + const uint32_t base = n_vecs * kVecN; + const uint32_t scalar_stride = blockDim.x * gridDim.x; + for (uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; + i < n_remainder; + i += scalar_stride) { + dst[base + i] = static_cast<T>(static_cast<float>(src[base + i]) * kFactor); + } +} + +// ---------------------------------------------------------------- +// Launcher: validates tensors, selects vector width, launches kernel +// ---------------------------------------------------------------- +template <typename T, int64_t kFactorNumer, int64_t kFactorDenom> +void scale(tvm::ffi::TensorView dst, tvm::ffi::TensorView src) { + using namespace host; + + // 1. Validate input tensors with TensorMatcher + SymbolicSize N = {"num_elements"}; + SymbolicDevice device_; + device_.set_options<kDLCUDA>(); + + TensorMatcher({N}) // + .with_dtype<T>() + .with_device(device_) + .verify(dst) + .verify(src); // same shape / dtype / device as dst + + const uint32_t n = static_cast<uint32_t>(N.unwrap()); + const DLDevice device = device_.unwrap(); + + RuntimeCheck(n > 0, "scale: num_elements must be > 0, got ", n); + + // 2. Choose vector width for 128-bit loads (16 bytes) + // fp16/bf16: 8 elements × 2 bytes = 16 bytes + // fp32: 4 elements × 4 bytes = 16 bytes + constexpr int kVecN = 16 / sizeof(T); + const uint32_t n_vecs = n / kVecN; + const uint32_t n_remainder = n % kVecN; + + // 3. Launch + constexpr uint32_t kBlockSize = 256; + const uint32_t grid = div_ceil(std::max(n_vecs, n_remainder), kBlockSize); + + LaunchKernel(grid, kBlockSize, device)( + scale_kernel<T, kVecN, kFactorNumer, kFactorDenom>, + static_cast<T*>(dst.data_ptr()), + static_cast<const T*>(src.data_ptr()), + n_vecs, + n_remainder, + n); +} + +} // namespace +``` + +**Key points:** + +- Include headers from `sgl_kernel/` — **not** raw CUDA headers for anything already covered +- Add a short trailing `// For ...` explanation to every `#include <...>` line +- Use `TensorMatcher` for all tensor validation; never manually check shape/dtype/device +- Use `AlignedVector` for vectorised 128-bit loads/stores — significant bandwidth win +- Use `LaunchKernel` — it resolves the stream and checks errors automatically +- Use `RuntimeCheck` for runtime assertions with useful error messages +- `fp16_t` / `bf16_t` / `fp32_t` are the project's type aliases (from `utils.cuh`) +- `device::cast<T>` or `dtype_trait<T>::from(val)` for cross-type conversions +- `device::math::` functions for device math instead of bare `__` intrinsics + +--- + +## Step 2: Add the Python wrapper in `jit_kernel/` + +Create `python/sglang/jit_kernel/scale.py`: + +```python +from __future__ import annotations + +from typing import TYPE_CHECKING + +import torch + +from 
sglang.jit_kernel.utils import cache_once, load_jit, make_cpp_args + +if TYPE_CHECKING: + from tvm_ffi.module import Module + + +@cache_once +def _jit_scale_module(dtype: torch.dtype, factor_numer: int, factor_denom: int) -> Module: + """Compile and cache the JIT scale module for a given dtype and factor.""" + args = make_cpp_args(dtype, factor_numer, factor_denom) + return load_jit( + "scale", + *args, + cuda_files=["elementwise/scale.cuh"], + cuda_wrappers=[("scale", f"scale<{args}>")], + ) + + +def scale(src: torch.Tensor, factor: float, out: torch.Tensor | None = None) -> torch.Tensor: + """ + Element-wise scale: dst = src * factor. + + Supported dtypes: torch.float16, torch.bfloat16, torch.float32. + + Parameters + ---------- + src : CUDA tensor (FP16 / BF16 / FP32) + factor : scale factor + out : optional pre-allocated output tensor (same shape/dtype as src) + + Returns + ------- + Scaled tensor (dst = src * factor). + """ + assert src.is_cuda, "src must be a CUDA tensor" + assert src.dtype in (torch.float16, torch.bfloat16, torch.float32), ( + f"Unsupported dtype {src.dtype}. 
Supported: float16, bfloat16, float32" + ) + if out is None: + out = torch.empty_like(src) + else: + assert out.shape == src.shape, "out shape must match src" + assert out.dtype == src.dtype, "out dtype must match src" + + # Encode factor as integer ratio; denom=1000 gives 3 decimal places of precision + factor_denom = 1000 + factor_numer = round(factor * factor_denom) + + module = _jit_scale_module(src.dtype, factor_numer, factor_denom) + module.scale(out, src) + return out +``` + +**Key points:** + +- Use `cache_once` — **not** `functools.lru_cache` (incompatible with `torch.compile`) +- `load_jit` first arg(s) form the unique build marker; same marker = same cached binary +- `cuda_wrappers`: `(export_name, kernel_symbol)` — `export_name` is called from Python +- `make_cpp_args(dtype, ...)` converts `torch.dtype` to C++ type alias: + +| `torch.dtype` | C++ type | +|--------------------|------------| +| `torch.float16` | `fp16_t` | +| `torch.bfloat16` | `bf16_t` | +| `torch.float32` | `fp32_t` | + +--- + +## Step 3 (optional): Tune JIT build flags + +```python +return load_jit( + "scale", + *args, + cuda_files=["elementwise/scale.cuh"], + cuda_wrappers=[("scale", f"scale<{args}>")], + extra_cuda_cflags=["-O3", "--use_fast_math"], +) +``` + +If your kernel requires SM90+, raise a clear Python error before calling `load_jit`: + +```python +if torch.cuda.get_device_capability()[0] < 9: + raise RuntimeError("This kernel requires SM90 (Hopper) or later") +``` + +--- + +## Step 4: Write tests (required) + +Create `python/sglang/jit_kernel/tests/test_scale.py`: + +```python +import pytest +import torch +from sglang.jit_kernel.scale import scale + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +@pytest.mark.parametrize("size", [1, 127, 128, 1024, 4097]) # cover tail remainder +@pytest.mark.parametrize("factor", [0.5, 1.0, 2.0, 3.0]) +def test_scale_correctness(dtype, size, factor): + src = torch.randn(size, dtype=dtype, device="cuda") 
+ out = scale(src, factor) + expected = src * factor + + rtol, atol = (1e-5, 1e-6) if dtype == torch.float32 else (1e-2, 1e-2) + torch.testing.assert_close(out, expected, rtol=rtol, atol=atol) + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +def test_scale_out_param(dtype): + src = torch.randn(1024, dtype=dtype, device="cuda") + out = torch.empty_like(src) + result = scale(src, 2.0, out=out) + assert result is out + torch.testing.assert_close(out, src * 2.0, rtol=1e-2, atol=1e-2) + + +def test_scale_cpu_error(): + src = torch.randn(128, dtype=torch.float16) # CPU tensor + with pytest.raises(AssertionError, match="CUDA"): + scale(src, 2.0) + + +def test_scale_unsupported_dtype(): + src = torch.randint(0, 10, (128,), dtype=torch.int32, device="cuda") + with pytest.raises(AssertionError, match="Unsupported dtype"): + scale(src, 2.0) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) +``` + +--- + +## Step 5: Add a benchmark (required) + +Create `python/sglang/jit_kernel/benchmark/bench_scale.py`: + +```python +import itertools + +import torch +import triton +import triton.testing + +from sglang.jit_kernel.benchmark.utils import ( + DEFAULT_DEVICE, + DEFAULT_DTYPE, + get_benchmark_range, + run_benchmark, +) +from sglang.jit_kernel.scale import scale as jit_scale + + +SIZE_LIST = get_benchmark_range( + full_range=[2**n for n in range(10, 20)], # 1K … 512K elements + ci_range=[4096, 65536], +) + +configs = list(itertools.product(SIZE_LIST)) + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["size"], + x_vals=configs, + line_arg="provider", + line_vals=["jit", "torch"], + line_names=["SGL JIT Kernel", "PyTorch"], + styles=[("blue", "-"), ("red", "--")], + ylabel="us", + plot_name="scale-performance", + args={}, + ) +) +def benchmark(size: int, provider: str): + src = torch.randn(size, dtype=DEFAULT_DTYPE, device=DEFAULT_DEVICE) + factor = 2.0 + + if provider == "jit": + fn = lambda: 
jit_scale(src, factor) + else: + fn = lambda: src * factor + + return run_benchmark(fn) + + +if __name__ == "__main__": + benchmark.run(print_data=True) +``` + +Run: + +```bash +python python/sglang/jit_kernel/benchmark/bench_scale.py +``` + +--- + +## Troubleshooting + +- **JIT compilation fails**: ensure the `.cuh` file is under `python/sglang/jit_kernel/csrc/`; reduce template argument combinations +- **CUDA crash / illegal memory access**: `CUDA_LAUNCH_BLOCKING=1`; `compute-sanitizer --tool memcheck python ...` +- **Unstable benchmark results**: `run_benchmark` uses CUDA-graph-based timing by default + +--- + +## References + +- `docs/developer_guide/development_jit_kernel_guide.md` +- `python/sglang/jit_kernel/utils.py` — `cache_once`, `load_jit`, `make_cpp_args` +- `python/sglang/jit_kernel/include/sgl_kernel/tensor.h` — `TensorMatcher`, `SymbolicSize/DType/Device` +- `python/sglang/jit_kernel/include/sgl_kernel/utils.cuh` — type aliases, `LaunchKernel`, `SGL_DEVICE` +- `python/sglang/jit_kernel/include/sgl_kernel/vec.cuh` — `AlignedVector` +- `python/sglang/jit_kernel/include/sgl_kernel/tile.cuh` — `tile::Memory` +- `python/sglang/jit_kernel/include/sgl_kernel/type.cuh` — `dtype_trait`, `packed_t`, `device::cast` +- `python/sglang/jit_kernel/include/sgl_kernel/math.cuh` — `device::math::` +- `python/sglang/jit_kernel/include/sgl_kernel/warp.cuh` — `warp::reduce_sum/max` +- `python/sglang/jit_kernel/include/sgl_kernel/cta.cuh` — `cta::reduce_max` +- `python/sglang/jit_kernel/include/sgl_kernel/atomic.cuh` — `atomic::max` +- `python/sglang/jit_kernel/include/sgl_kernel/runtime.cuh` — occupancy / SM count helpers +- `python/sglang/jit_kernel/csrc/add_constant.cuh` — minimal runnable reference +- `python/sglang/jit_kernel/csrc/elementwise/rmsnorm.cuh` — real example using `TensorMatcher` + `LaunchKernel` + `tile::Memory` +- `python/sglang/jit_kernel/csrc/elementwise/qknorm.cuh` — real example using `runtime::get_blocks_per_sm` + persistent kernel pattern +- 
`python/sglang/jit_kernel/benchmark/utils.py` — benchmark helpers + +## Summary of Files Created + +``` +python/sglang/jit_kernel/csrc/elementwise/scale.cuh # NEW: CUDA kernel +python/sglang/jit_kernel/scale.py # NEW: Python wrapper +python/sglang/jit_kernel/tests/test_scale.py # NEW: Tests +python/sglang/jit_kernel/benchmark/bench_scale.py # NEW: Benchmark +``` diff --git a/.claude/skills/add-sgl-kernel/SKILL.md b/.claude/skills/add-sgl-kernel/SKILL.md new file mode 100644 index 000000000000..07767c16b68a --- /dev/null +++ b/.claude/skills/add-sgl-kernel/SKILL.md @@ -0,0 +1,364 @@ +--- +name: add-sgl-kernel +description: Step-by-step tutorial for adding a heavyweight AOT CUDA/C++ kernel to sgl-kernel (including tests & benchmarks) +--- + +# Tutorial: Adding a New Kernel to `sgl-kernel` (AOT / Heavyweight) + +This tutorial walks through adding a simple element-wise scale operation as an AOT kernel. We'll implement `scale(x, factor) = x * factor` to demonstrate the complete workflow. + +## Goal + +Add a new operation that scales each element of a tensor by a scalar factor: + +- Input: tensor `x` (CUDA) and scalar `factor` (float) +- Output: `x * factor` (element-wise, in-place or into pre-allocated `out`) +- Supported dtypes: **FP16 (`torch.float16`), BF16 (`torch.bfloat16`), FP32 (`torch.float32`)** + - Dispatched via `DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16` macro (defined in `sgl-kernel/include/utils.h`) + +## Rules of thumb (must follow) + +1. **Prefer `python/sglang/jit_kernel` first** when the kernel does **not** depend on CUTLASS or another large C++ project. This is the default path for lightweight kernels that benefit from rapid iteration. +2. **Prefer `sgl-kernel`** when the kernel **does** depend on CUTLASS or another large C++ project, or when it should be part of the AOT wheel / torch op registration flow. +3. 
**Exception**: if the dependency is `flashinfer`, or CUTLASS that is already provided through `flashinfer`, the kernel can still be implemented as `jit_kernel`. + +In addition, every new kernel must ship with: + +- **Tests** (pytest) +- **A benchmark script** (triton.testing) + +--- + +## Repository integration map + +You will typically touch these files/areas: + +- Implementation: `sgl-kernel/csrc/elementwise/scale.cu` (pick the right subdirectory) +- Public declarations: `sgl-kernel/include/sgl_kernel_ops.h` +- Torch extension registration: `sgl-kernel/csrc/common_extension.cc` +- Build: `sgl-kernel/CMakeLists.txt` (`set(SOURCES ...)`) +- Python API: `sgl-kernel/python/sgl_kernel/` and `sgl-kernel/python/sgl_kernel/__init__.py` +- Tests: `sgl-kernel/tests/test_scale.py` +- Benchmarks: `sgl-kernel/benchmark/bench_scale.py` + +--- + +## Step 1: Implement the kernel in `csrc/` + +Pick the right subdirectory: + +- `csrc/elementwise/` — for element-wise ops (our example) +- `csrc/gemm/`, `csrc/attention/`, `csrc/moe/` — for other categories + +Create `sgl-kernel/csrc/elementwise/scale.cu`: + +```cpp +#include <ATen/ATen.h> +#include <ATen/cuda/CUDAContext.h> +#include <c10/cuda/CUDAGuard.h> + +#include "utils.h" // DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16 + +// scale_kernel: out[i] = input[i] * factor +// Supports float, half (__half), __nv_bfloat16 via template T +template <typename T> +__global__ void scale_kernel(T* __restrict__ out, + const T* __restrict__ input, + float factor, + int64_t n) { + int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx] = static_cast<T>(static_cast<float>(input[idx]) * factor); + } +} + +void scale(at::Tensor& out, const at::Tensor& input, double factor) { + TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor"); + TORCH_CHECK(input.is_contiguous(), "input must be contiguous"); + TORCH_CHECK(out.is_cuda(), "out must be a CUDA tensor"); + TORCH_CHECK(out.is_contiguous(), "out must be contiguous"); + TORCH_CHECK(out.sizes() == input.sizes(), "out and input must have the same shape"); + TORCH_CHECK(out.scalar_type() == input.scalar_type(), + "out and input must have the same dtype"); + + const int64_t n = input.numel(); + const int threads = 256; + const int blocks = (n + threads - 1) / threads; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // Dispatches over float, float16, bfloat16 + DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] { + scale_kernel<c_type><<<blocks, threads, 0, stream>>>( + static_cast<c_type*>(out.data_ptr()), + static_cast<const c_type*>(input.data_ptr()), + static_cast<float>(factor), + n); + cudaError_t status = cudaGetLastError(); + TORCH_CHECK(status == cudaSuccess, + "scale_kernel launch failed: ", cudaGetErrorString(status)); + return true; + }); +} +``` + +**Key points:** + +- Use `at::Tensor` (PyTorch tensors), `TORCH_CHECK` for validation, `at::cuda::getCurrentCUDAStream()` for stream — set the `OptionalCUDAGuard` *before* fetching the stream so it belongs to the input's device +- `DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16` covers `float`, `half` (FP16), `__nv_bfloat16` (BF16) +- Add device error checking after every kernel launch +- If a kernel only works on certain architectures, enforce that with `TORCH_CHECK` and skip logic in tests + +--- + +## Step 2: Add a C++ declaration in `include/sgl_kernel_ops.h` + +Edit `sgl-kernel/include/sgl_kernel_ops.h`, add to the elementwise section: + +```cpp +void scale(at::Tensor& out, const at::Tensor& input, double factor); +``` + +--- + +## Step 3: Register the op in `csrc/common_extension.cc` + +Edit `sgl-kernel/csrc/common_extension.cc`, inside `TORCH_LIBRARY_FRAGMENT(sgl_kernel, m)`: + +```cpp +// From csrc/elementwise +m.def("scale(Tensor! 
out, Tensor input, float factor) -> ()"); +m.impl("scale", torch::kCUDA, &scale); +``` + +**Key points:** + +- `Tensor!` means in-place / mutable output argument +- The schema is important for `torch.compile` and for consistent call signatures +- If your underlying C++ API uses `float` but PyTorch bindings expect `double`, the implicit cast is fine for scalars; use shims if needed for other types + +--- + +## Step 4: Add the new source file to `CMakeLists.txt` + +Edit `sgl-kernel/CMakeLists.txt`, add to `set(SOURCES ...)`: + +```cmake +csrc/elementwise/scale.cu +``` + +**Key points:** + +- Keep the list **alphabetically sorted** (the file explicitly requires this) +- If the kernel has arch constraints, reflect that in tests/benchmarks via skip logic + +--- + +## Step 5: Expose a Python API under `sgl-kernel/python/sgl_kernel/` + +Prefer following the existing module organization first. For elementwise kernels, the usual pattern is: + +- implement the Python wrapper in `sgl-kernel/python/sgl_kernel/elementwise.py` +- then re-export it from `sgl-kernel/python/sgl_kernel/__init__.py` + +For example, in `sgl-kernel/python/sgl_kernel/elementwise.py`, add: + +```python +import torch + +def scale( + input: torch.Tensor, + factor: float, + out: torch.Tensor | None = None, +) -> torch.Tensor: + """ + Element-wise scale: out = input * factor. + + Supported dtypes: torch.float16, torch.bfloat16, torch.float32. + + Parameters + ---------- + input : CUDA input tensor + factor : scale factor (float) + out : optional pre-allocated CUDA output tensor (same shape/dtype as input) + """ + if out is None: + out = torch.empty_like(input) + torch.ops.sgl_kernel.scale.default(out, input, factor) + return out +``` + +Then re-export it from `sgl-kernel/python/sgl_kernel/__init__.py` following the existing import style used by other kernels. 
+ +--- + +## Step 6: Write tests (required) + +Create `sgl-kernel/tests/test_scale.py`: +```python +import pytest + +import torch +import sgl_kernel + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +@pytest.mark.parametrize("size", [128, 1024, 4096, 65536]) +@pytest.mark.parametrize("factor", [0.5, 1.0, 2.0]) +def test_scale_correctness(dtype, size, factor): + input = torch.randn(size, dtype=dtype, device="cuda") + out = torch.empty_like(input) + + result = sgl_kernel.scale(input, factor, out=out) + assert result is out + + expected = input * factor + rtol, atol = (1e-5, 1e-6) if dtype == torch.float32 else (1e-2, 1e-2) + torch.testing.assert_close(out, expected, rtol=rtol, atol=atol) + + +def test_scale_shape_mismatch(): + input = torch.randn(128, dtype=torch.float16, device="cuda") + out = torch.empty(256, dtype=torch.float16, device="cuda") + with pytest.raises(RuntimeError, match="same shape"): + sgl_kernel.scale(input, 2.0, out=out) + + +def test_scale_cpu_input(): + input = torch.randn(128, dtype=torch.float16) # CPU + out = torch.empty_like(input) + with pytest.raises(RuntimeError, match="CUDA"): + sgl_kernel.scale(input, 2.0, out=out) + + +if __name__ == "__main__": + pytest.main([__file__, "-q"]) +``` + +--- + +## Step 7: Add a benchmark (required) + +Create `sgl-kernel/benchmark/bench_scale.py`: + +```python +import itertools +import os + +import torch +import triton +import triton.testing + +import sgl_kernel + +IS_CI = ( + os.getenv("CI", "false").lower() == "true" + or os.getenv("GITHUB_ACTIONS", "false").lower() == "true" +) + +dtypes = [torch.float16] if IS_CI else [torch.float16, torch.bfloat16, torch.float32] +sizes = [4096] if IS_CI else [2**n for n in range(10, 20)] # 1K … 512K +factors = [2.0] + +configs = list(itertools.product(dtypes, sizes)) + + +def torch_scale(input: torch.Tensor, factor: float) -> torch.Tensor: + return input * factor + + +@triton.testing.perf_report( + triton.testing.Benchmark( + 
x_names=["dtype", "size"], + x_vals=configs, + line_arg="provider", + line_vals=["sglang", "torch"], + line_names=["SGL Kernel", "PyTorch"], + styles=[("green", "-"), ("red", "--")], + ylabel="µs (median)", + plot_name="scale-performance", + args={}, + ) +) +def benchmark(dtype, size, provider): + input = torch.randn(size, dtype=dtype, device="cuda") + out = torch.empty_like(input) + factor = 2.0 + + if provider == "sglang": + fn = lambda: sgl_kernel.scale(input, factor, out=out) + else: + fn = lambda: torch_scale(input, factor) + + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + fn, quantiles=[0.5, 0.2, 0.8] + ) + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + +if __name__ == "__main__": + benchmark.run(print_data=True) +``` + +--- + +## Step 8: Build + +Build: + +```bash +cd sgl-kernel +make build -j16 +``` + +If you need to limit host resource usage: + +```bash +cd sgl-kernel +make build -j1 MAX_JOBS=2 CMAKE_ARGS="-DSGL_KERNEL_COMPILE_THREADS=1" +``` + +--- + +## Step 9: Validate + +After building successfully, run the test and benchmark: + +```bash +pytest sgl-kernel/tests/test_scale.py -q +python sgl-kernel/benchmark/bench_scale.py +``` + +--- + +## Troubleshooting + +- **Async CUDA errors**: `CUDA_LAUNCH_BLOCKING=1` +- **Memory errors**: `compute-sanitizer --tool memcheck python ...` +- **Build is too slow / OOM**: reduce `MAX_JOBS` and `SGL_KERNEL_COMPILE_THREADS` +- **Binary bloat**: use `sgl-kernel/analyze_whl_kernel_sizes.py` +- **CMake sources list**: if your `.cu` file is missing from `SOURCES`, the symbol will be undefined at link time + +--- + +## References + +- `sgl-kernel/README.md` +- `sgl-kernel/include/sgl_kernel_ops.h` +- `sgl-kernel/csrc/common_extension.cc` +- `sgl-kernel/CMakeLists.txt` +- `sgl-kernel/include/utils.h` — `DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16` macro and friends +- `sgl-kernel/csrc/elementwise/activation.cu` — reference for the FP16/BF16/FP32 dispatch pattern + +## Summary of Files Created/Modified + +``` 
+sgl-kernel/csrc/elementwise/scale.cu # NEW: CUDA kernel + launcher +sgl-kernel/include/sgl_kernel_ops.h # MODIFIED: C++ declaration +sgl-kernel/csrc/common_extension.cc # MODIFIED: schema + dispatch registration +sgl-kernel/CMakeLists.txt # MODIFIED: add source file (alphabetical) +sgl-kernel/python/sgl_kernel/elementwise.py # MODIFIED: Python wrapper +sgl-kernel/python/sgl_kernel/__init__.py # MODIFIED: re-export Python API +sgl-kernel/tests/test_scale.py # NEW: tests +sgl-kernel/benchmark/bench_scale.py # NEW: benchmark +``` diff --git a/.claude/skills/amd/enable-amd-model/SKILL.md b/.claude/skills/amd/enable-amd-model/SKILL.md new file mode 100644 index 000000000000..c1050326e569 --- /dev/null +++ b/.claude/skills/amd/enable-amd-model/SKILL.md @@ -0,0 +1,218 @@ +--- +name: enable-amd-model +description: End-to-end workflow for enabling a new model on AMD GPUs in SGLang. Covers HuggingFace architecture research, AMD backend selection (aiter/triton/wave/NSA with auto-selection logic), accuracy test file creation for MI30x and MI35x, CI workflow YAML updates, and documentation. Use when enabling a model on AMD, adding a model to AMD CI, or when the user mentions AMD model enablement. +--- + +# Enable a Model on AMD GPUs + +End-to-end workflow: architecture research, test files (MI30x + MI35x), CI YAML (2 workflow files x 3 edit locations each), documentation, and local validation. + +## Step 1: Architecture Research + +Fetch the model's HuggingFace config and SGLang model implementation. 
+ +```bash +curl -s https://huggingface.co/{MODEL_PATH}/raw/main/config.json | python3 -m json.tool +``` + +Key fields to extract: + +| Field | What it tells you | +|---|---| +| `architectures` | Maps to `python/sglang/srt/models/` via `ModelRegistry` | +| `num_hidden_layers` | Total layer count | +| `num_attention_heads` / `num_key_value_heads` | GQA ratio | +| `kv_lora_rank` or `q_lora_rank` | MLA architecture indicator | +| `num_experts` / `num_experts_per_tok` | MoE configuration | +| `quantization_config` | FP8/INT4/MXFP4 format | + +### Determine AttentionArch + +SGLang has two attention architectures (defined in `python/sglang/srt/configs/model_config.py`): + +| AttentionArch | Models | Detection | +|---|---|---| +| **MLA** | DeepSeek-V2/V3/V3.2, Kimi-K2/K2.5, MiniCPM3, GLM-MoE-DSA, MistralLarge3, Pixtral, BailingMoe, SarvamMLA | Has `kv_lora_rank` in config, or architecture class in MLA list in `model_config.py` | +| **MHA** | Everything else (Llama, Mistral, Mixtral, Qwen, MiniMax, GLM-5, Grok, etc.) | Default | + +Read the model source in `python/sglang/srt/models/` and `python/sglang/srt/configs/model_config.py` to confirm. + +## Step 2: Backend Selection + +### Auto-selection (preferred) + +SGLang **auto-selects** the attention backend when `--attention-backend` is not set. On AMD (`is_hip()` is true), the logic in `server_args.py::_get_default_attn_backend()` is: + +| AttentionArch | Auto-selected backend | Condition | +|---|---|---| +| MHA | `aiter` | Default on HIP | +| MLA | `aiter` | When `num_kv_heads / tp_size` is 16 or 128 | +| MLA | `triton` | When head count is not 16 or 128 | +| NSA models (GLM-5) | `nsa` with `tilelang` prefill+decode | Auto-detected on HIP | + +**In most cases, just set `SGLANG_USE_AITER=1` and let auto-selection work.** Only override `--attention-backend` when you need a specific backend. 
+ +### AMD-compatible backends + +From `ATTENTION_BACKEND_CHOICES` in `server_args.py`: + +| Backend | AMD support | When to use | +|---|---|---| +| `aiter` | AMD-specific | Default for most MHA and MLA models | +| `triton` | Cross-platform | Fallback when aiter doesn't support head count; also used for some base models | +| `wave` | AMD-specific | Wave kernel backend (experimental) | +| `nsa` | Yes (with tilelang) | For NSA models (GLM-5 style); prefill/decode default to `tilelang` on HIP | +| `torch_native` | Cross-platform | Generic fallback | +| `flex_attention` | Cross-platform | Torch flex attention | + +NSA sub-backends (`NSA_CHOICES`): `tilelang` (default on HIP), `aiter`, `flashmla_sparse`, `flashmla_kv`, `flashmla_auto`, `fa3`, `trtllm`. + +### Special cases + +- **Llama4**: auto-selects `aiter` on HIP +- **Diffusion models**: forces `triton` on HIP +- **Mixed prefill/decode**: use `--prefill-attention-backend` and `--decode-attention-backend` to override separately (e.g., Kimi-K2.5 uses `aiter` prefill + `triton` decode) +- **NSA models**: use `--nsa-prefill-backend tilelang --nsa-decode-backend tilelang` (or let auto-selection handle it) + +### Common server args by model type + +| Model characteristic | Additional args | +|---|---| +| MoE with many experts | `--ep-size 8` | +| Large models (>100B) | `--mem-fraction-static 0.85 --watchdog-timeout 1200` | +| MLA models | `--chunked-prefill-size 131072` | +| Models needing trust | `--trust-remote-code` | +| Fast loading | `--model-loader-extra-config '{"enable_multithread_load": true}'` | + +## Step 3: Create Test Files + +Create **two** test files — one for MI30x, one for MI35x. See the `write-amd-nightly-test` skill for detailed templates and patterns. 
+ +### File locations + +- MI30x: `test/registered/amd/accuracy/mi30x/test_{model}_eval_amd.py` +- MI35x: `test/registered/amd/accuracy/mi35x/test_{model}_eval_mi35x.py` + +### Key MI30x vs MI35x differences + +| | MI30x | MI35x | +|---|---|---| +| HF cache | (system default) | `os.environ.setdefault("HF_HOME", "/data2/models/huggingface")` and `os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub")` | +| `est_time` | 3600 | 5400 | +| Suite name prefix | `nightly-amd-accuracy-8-gpu-` | `nightly-amd-8-gpu-mi35x-` or `nightly-amd-accuracy-8-gpu-mi35x-` | +| Class name suffix | `EvalAMD` | `EvalMI35x` | +| Summary header | `(MI325)` | `(MI35x)` | +| `timeout` in ModelConfig | 3600 | 5400 | + +## Step 4: Update CI Workflow YAML + +Edit **both** workflow files. Each requires changes in **three** places. + +### Runners + +| Platform | GPUs | Runner label | +|---|---|---| +| MI30x (MI300X/MI325X) | 1 | `linux-mi325-1gpu-sglang` | +| MI30x | 2 | `linux-mi325-2gpu-sglang` | +| MI30x | 8 | `linux-mi325-8gpu-sglang` | +| MI35x (MI355X) | 1 | `linux-mi35x-gpu-1` | +| MI35x | 8 | `linux-mi35x-gpu-8` | +| MI35x (disagg/RDMA) | 8 | `linux-mi35x-gpu-8.fabric` | + +### File 1: `.github/workflows/nightly-test-amd.yml` + +**Place 1** — Add to `job_select` options list: +```yaml + - nightly-8-gpu-{model} # MI30x + - nightly-8-gpu-mi35x-{model} # MI35x +``` + +**Place 2** — Add job definition block (MI30x template): +```yaml + nightly-8-gpu-{model}: + if: >- + (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') + && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' + || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-{model},')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + - name: Setup docker + run: | + touch github_summary.md + bash 
scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Accuracy Test (8-GPU {MODEL_DISPLAY}) + timeout-minutes: 120 + run: | + > github_summary.md + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite {SUITE_NAME} --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} +``` + +MI35x variant: `runs-on: linux-mi35x-gpu-8`, job name prefix `nightly-8-gpu-mi35x-{model}`. + +**Place 3** — Add to `check-all-jobs.needs` list: +```yaml + - nightly-8-gpu-{model} + - nightly-8-gpu-mi35x-{model} +``` + +### File 2: `.github/workflows/nightly-test-amd-rocm720.yml` + +Same three places, but: +- Job names get `-rocm720` suffix +- Docker setup adds `--rocm-version rocm720` +- Install step adds `--skip-test-time-deps` +- Suite name stays the same (test file is shared) + +## Step 5: Update Documentation + +If the model has a docs page under `docs/basic_usage/`, add an AMD GPU deployment section. If the model is new, add it to `docs/supported_models/text_generation/generative_models.md`. + +## Step 6: Local Validation + +```bash +docker exec -it {CONTAINER} bash +cd /sglang-checkout && pip install -e "python[all]" + +SGLANG_USE_AITER=1 python3 -m sglang.launch_server \ + --model {MODEL_PATH} --tp 8 {OTHER_ARGS} & + +python3 test/registered/amd/accuracy/mi30x/test_{model}_eval_amd.py +``` + +Verify: accuracy meets threshold, no HIP errors, server launches within timeout. 
+ +## Checklist + +- [ ] HuggingFace config analyzed (AttentionArch: MLA or MHA) +- [ ] Backend verified (auto-selection or explicit override) +- [ ] MI30x test file created in `test/registered/amd/accuracy/mi30x/` +- [ ] MI35x test file created in `test/registered/amd/accuracy/mi35x/` +- [ ] `nightly-test-amd.yml` updated (3 places: options, job block, needs) +- [ ] `nightly-test-amd-rocm720.yml` updated (3 places: options, job block, needs) +- [ ] Documentation updated (if applicable) +- [ ] Local validation passed on AMD hardware + +## References + +- `python/sglang/srt/server_args.py` — `ATTENTION_BACKEND_CHOICES`, `_get_default_attn_backend()`, auto-selection logic +- `python/sglang/srt/configs/model_config.py` — `AttentionArch` enum (MLA, MHA) +- `python/sglang/srt/layers/attention/attention_registry.py` — backend name → class mapping +- `test/registered/amd/accuracy/mi30x/test_minimax_m25_eval_amd.py` — standalone MHA test +- `test/registered/amd/accuracy/mi30x/test_deepseek_v32_eval_amd.py` — standalone MLA test +- `test/registered/amd/accuracy/mi30x/test_kimi_k25_eval_amd.py` — shared evaluator test +- `test/registered/amd/accuracy/mi30x/test_glm5_eval_amd.py` — NSA backend test +- `.github/workflows/nightly-test-amd.yml` — CI workflow (ROCm default) +- `.github/workflows/nightly-test-amd-rocm720.yml` — CI workflow (ROCm 7.2) diff --git a/.claude/skills/amd/write-amd-nightly-test/SKILL.md b/.claude/skills/amd/write-amd-nightly-test/SKILL.md new file mode 100644 index 000000000000..a5fbd1df614c --- /dev/null +++ b/.claude/skills/amd/write-amd-nightly-test/SKILL.md @@ -0,0 +1,294 @@ +--- +name: write-amd-nightly-test +description: Write AMD nightly accuracy and performance tests for MI30x and MI35x platforms. Covers GSM8K completion benchmarks, MMMU VLM evaluations, performance benchmarks with NightlyBenchmarkRunner, CI suite registration with register_amd_ci, and cross-platform test variants. 
Use when creating AMD test files, adding models to AMD nightly CI, or writing accuracy/performance tests for AMD GPUs. +--- + +# Write AMD Nightly Test + +Guide for writing nightly CI tests that run on AMD MI30x (MI300X/MI325X) and MI35x (MI355X) hardware. + +## Test Types and Templates + +| Type | Evaluation | Template file | +|---|---|---| +| Text accuracy (GSM8K standalone) | Inline 5-shot completion benchmark | `test/registered/amd/accuracy/mi30x/test_minimax_m25_eval_amd.py` | +| Text accuracy (shared evaluator) | `sglang.test.few_shot_gsm8k.run_eval` | `test/registered/amd/accuracy/mi30x/test_kimi_k25_eval_amd.py` | +| Text accuracy (LMEvalMixin) | `LMEvalMixin` + `CustomTestCase` | `test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py` | +| VLM accuracy (MMMU) | `run_eval` with `eval_name="mmmu"` | `test/registered/amd/accuracy/mi30x/test_vlms_mmmu_eval_amd.py` | +| Performance | `NightlyBenchmarkRunner` | `test/registered/amd/perf/mi30x/test_deepseek_v32_basic_perf_amd.py` | +| Diffusion | Custom server + generation | `test/registered/amd/test_wan2_2_i2v_a14b.py` | + +## CI Registration + +Every test file **must** call `register_amd_ci` at module level: + +```python +from sglang.test.ci.ci_register import register_amd_ci + +register_amd_ci( + est_time=3600, + suite="nightly-amd-accuracy-8-gpu-{model}", + nightly=True, +) +``` + +The `suite` name must match the CI workflow YAML job that invokes `run_suite.py --suite {suite}`. 
+ +### Suite naming (from actual codebase) + +Suite names are **not strictly uniform** — follow the naming style closest to existing tests of the same type: + +| Pattern | Examples | +|---|---| +| `nightly-amd-accuracy-8-gpu-{model}` | `nightly-amd-accuracy-8-gpu-minimax-m25`, `nightly-amd-accuracy-8-gpu-glm5` | +| `nightly-amd-8-gpu-{feature}` | `nightly-amd-8-gpu-grok`, `nightly-amd-8-gpu-deepseek-v3-kv-fp8` | +| `nightly-amd-8-gpu-mi35x-{model}` | `nightly-amd-8-gpu-mi35x-glm5`, `nightly-amd-8-gpu-mi35x-minimax-m25` | +| `nightly-amd-accuracy-8-gpu-mi35x-{model}` | `nightly-amd-accuracy-8-gpu-mi35x-qwen35`, `nightly-amd-accuracy-8-gpu-mi35x-kimi-k25` | +| `nightly-perf-8-gpu-{model}` | `nightly-perf-8-gpu-deepseek-v32-basic`, `nightly-perf-8-gpu-grok2` | +| `nightly-perf-8-gpu-mi35x-{model}` | `nightly-perf-8-gpu-mi35x-deepseek-v32-basic` | +| `nightly-8-gpu-{model}` | `nightly-8-gpu-qwen3-235b` | +| `nightly-amd` | Shared 2-GPU GSM8K accuracy suite | +| `nightly-amd-accuracy-2-gpu-vlm` | VLM MMMU 2-GPU suite | +| `stage-b-test-small-1-gpu-amd` | Per-PR unit tests on AMD | +| `stage-c-test-large-8-gpu-amd` | Per-PR 8-GPU tests on AMD | + +### Runners + +| Platform | GPUs | Runner label | +|---|---|---| +| MI30x | 1 | `linux-mi325-1gpu-sglang` | +| MI30x | 2 | `linux-mi325-2gpu-sglang` | +| MI30x | 8 | `linux-mi325-8gpu-sglang` | +| MI35x | 1 | `linux-mi35x-gpu-1` | +| MI35x | 8 | `linux-mi35x-gpu-8` | +| MI35x (disagg) | 8 | `linux-mi35x-gpu-8.fabric` | + +## Text Accuracy Test Patterns + +### Pattern A: Standalone GSM8K (inline benchmark, `unittest.TestCase`) + +For models needing custom server args or per-model iteration. Most accuracy tests use this pattern. 
+ +Key structure from `test_minimax_m25_eval_amd.py`: + +```python +from sglang.test.ci.ci_register import register_amd_ci +register_amd_ci(est_time=3600, suite="nightly-amd-accuracy-8-gpu-{model}", nightly=True) + +@dataclass +class ModelConfig: + model_path: str + tp_size: int = 8 + accuracy_threshold: float = 0.50 + other_args: Optional[List[str]] = None + env_vars: Optional[dict] = None + timeout: Optional[int] = None + variant: Optional[str] = None + +MODELS = [ + ModelConfig( + model_path="{ORG}/{MODEL}", + tp_size=8, + accuracy_threshold=0.93, + timeout=3600, + variant="TP8", + other_args=["--attention-backend", "aiter", ...], + env_vars={"SGLANG_USE_AITER": "1"}, + ), +] + +class TestModelEvalAMD(unittest.TestCase): + def test_accuracy(self): + for config in self.models: + process = popen_launch_server(config.model_path, ...) + try: + acc, invalid, latency = run_gsm8k_benchmark(...) + finally: + kill_process_tree(process.pid) +``` + +### Pattern B: Shared evaluator (`CustomTestCase` + `few_shot_gsm8k`) + +For simpler models with a long-lived server. Used by Kimi-K2.5, Kimi-K2, DeepSeek-V3.2 variants. + +```python +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.test.test_utils import CustomTestCase + +class TestModelEvalAMD(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.process = popen_launch_server(MODEL_PATH, cls.base_url, ...) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_accuracy(self): + args = SimpleNamespace(num_shots=8, num_questions=1319, parallel=1319, ...) + metrics = run_eval_few_shot_gsm8k(args) + self.assertGreaterEqual(metrics["accuracy"], THRESHOLD) +``` + +### Pattern C: LMEvalMixin (`CustomTestCase` + mixin) + +For models that use the `lm-eval` harness. Used by Qwen3.5. + +```python +from sglang.test.test_utils import CustomTestCase, LMEvalMixin + +class TestModelEvalAMD(LMEvalMixin, CustomTestCase): + ... 
+``` + +## Backend Args by Architecture + +### MHA models (standard attention) + +```python +other_args=[ + "--attention-backend", "aiter", # or omit to use auto-selection + "--trust-remote-code", + "--mem-fraction-static", "0.85", +], +env_vars={"SGLANG_USE_AITER": "1"}, +``` + +### MLA models (DeepSeek-style) + +```python +other_args=[ + "--attention-backend", "aiter", # auto-selects when head_num is 16 or 128 + "--chunked-prefill-size", "131072", + "--trust-remote-code", + "--mem-fraction-static", "0.85", + "--model-loader-extra-config", '{"enable_multithread_load": true}', + "--watchdog-timeout", "1200", +], +env_vars={"SGLANG_USE_AITER": "1"}, +``` + +### MoE models + +Add `--ep-size 8` for models with many experts (e.g., MiniMax-M2.5 with 256 experts). + +### NSA models (GLM-5 style) + +```python +other_args=[ + "--trust-remote-code", + "--nsa-prefill-backend", "tilelang", # default on HIP, can omit + "--nsa-decode-backend", "tilelang", # default on HIP, can omit + "--chunked-prefill-size", "131072", + "--mem-fraction-static", "0.80", + "--model-loader-extra-config", '{"enable_multithread_load": true}', + "--watchdog-timeout", "1200", +], +env_vars={"SGLANG_USE_AITER": "1"}, +``` + +### Mixed prefill/decode (Kimi-K2.5) + +```python +other_args=[ + "--decode-attention-backend", "triton", + "--prefill-attention-backend", "aiter", +], +env_vars={"SGLANG_USE_AITER": "1", "SGLANG_ROCM_FUSED_DECODE_MLA": "0"}, +``` + +## VLM Accuracy Test Pattern + +VLM tests use MMMU evaluation instead of GSM8K: + +```python +args = SimpleNamespace( + base_url=self.base_url, + model=model_path, + eval_name="mmmu", + num_examples=100, + num_threads=64, + max_tokens=30, +) +metrics = run_eval(args) +``` + +VLM tests typically: +- Use TP=1 or TP=2 (smaller models) +- Support retries (up to 3 attempts) +- Track startup/eval/total times in summaries +- Exclude known-failing models via `AMD_FAILING_VLM_MODELS` list + +## Performance Test Pattern + +Performance tests use 
`NightlyBenchmarkRunner`: + +```python +from sglang.test.nightly_utils import NightlyBenchmarkRunner + +runner = NightlyBenchmarkRunner(model_path, variant_config, ...) +runner.run_benchmark_for_model(batch_sizes=[1, 32], input_lens=[...], output_lens=[...]) +``` + +## MI30x vs MI35x Differences + +When creating a MI35x variant, apply these changes: + +```python +# Add at TOP of MI35x file (before other imports) +import os +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") +``` + +| Aspect | MI30x | MI35x | +|---|---|---| +| Suite name | `nightly-amd-accuracy-8-gpu-{model}` | `nightly-amd-8-gpu-mi35x-{model}` | +| `est_time` | 3600 | 5400 | +| Class name | `Test{Model}EvalAMD` | `Test{Model}EvalMI35x` | +| `timeout` | 3600 | 5400 | + +## GitHub Step Summary + +```python +from sglang.test.test_utils import is_in_ci, write_github_step_summary + +if is_in_ci(): + summary = "### {Model} ({Platform})\n\n" + summary += "| Model | TP | Accuracy | Threshold | Status |\n" + summary += "| ----- | -- | -------- | --------- | ------ |\n" + summary += f"| {path} | {tp} | {acc:.3f} | {threshold} | {status} |\n" + write_github_step_summary(summary) +``` + +## Accuracy Thresholds + +Always measure on real AMD hardware first. Set threshold ~2-3% below measured accuracy. 
+ +## Checklist + +- [ ] Test class inherits from `unittest.TestCase` or `CustomTestCase` +- [ ] `register_amd_ci(...)` called at module level +- [ ] Suite name matches CI workflow YAML job +- [ ] `SGLANG_USE_AITER=1` set in env vars +- [ ] Server killed in teardown via `kill_process_tree` +- [ ] GitHub step summary generated +- [ ] Has `if __name__ == "__main__": unittest.main()` +- [ ] MI35x variant created with HF cache env, adjusted est_time/timeout, updated suite name +- [ ] Accuracy threshold validated on real hardware + +## References + +- `.claude/skills/write-sglang-test/SKILL.md` — general SGLang test writing guide +- `.claude/skills/amd/enable-amd-model/SKILL.md` — full enablement workflow including CI YAML +- `python/sglang/srt/server_args.py` — `ATTENTION_BACKEND_CHOICES`, `NSA_CHOICES`, auto-selection +- `python/sglang/test/few_shot_gsm8k.py` — shared GSM8K evaluator +- `python/sglang/test/test_utils.py` — `CustomTestCase`, `LMEvalMixin`, `popen_launch_server` +- `python/sglang/test/nightly_utils.py` — `NightlyBenchmarkRunner` for perf tests +- `test/registered/amd/accuracy/mi30x/test_minimax_m25_eval_amd.py` — standalone MHA template +- `test/registered/amd/accuracy/mi30x/test_deepseek_v32_eval_amd.py` — standalone MLA template +- `test/registered/amd/accuracy/mi30x/test_kimi_k25_eval_amd.py` — shared evaluator template +- `test/registered/amd/accuracy/mi30x/test_glm5_eval_amd.py` — NSA backend template +- `test/registered/amd/accuracy/mi30x/test_qwen35_eval_amd.py` — LMEvalMixin template +- `test/registered/amd/accuracy/mi30x/test_vlms_mmmu_eval_amd.py` — VLM template +- `test/registered/amd/perf/mi30x/test_deepseek_v32_basic_perf_amd.py` — perf template diff --git a/.claude/skills/sglang-bisect-ci-regression/SKILL.md b/.claude/skills/sglang-bisect-ci-regression/SKILL.md new file mode 100644 index 000000000000..4eb39227c16e --- /dev/null +++ b/.claude/skills/sglang-bisect-ci-regression/SKILL.md @@ -0,0 +1,219 @@ +# SGLang Bisect CI Regression + 
+Investigate a consistently failing CI test to find the root cause - whether it's a code regression from a specific PR, a hardware/runner-specific issue, or an environment change. Optionally reproduce the failure on a remote GPU server. + +## Slash Command + +`/sglang-bisect-ci-regression [ssh_target] [docker_container]` + +## When to Use This Skill + +- A CI test is failing consistently on main (scheduled runs) +- You need to find which PR introduced a regression +- You suspect a runner-specific or GPU-specific issue +- You want to reproduce a CI failure on a remote server + +## Arguments + +- **First argument (required)**: Test file name (e.g. `test_lora_tp.py`) or a GitHub Actions job URL +- **Second argument (optional)**: SSH target for remote reproduction (e.g. `user@host`) +- **Third argument (optional)**: Docker container name on the SSH target (e.g. `sglang_dev`) + +If SSH target and docker container are not provided, the skill will only perform the CI log analysis and bisection, without remote reproduction. **Ask the user** for these if reproduction is needed and they weren't provided. + +## Background: Scheduled CI Runs + +SGLang uses the `pr-test.yml` workflow with **scheduled runs** (cron-triggered) to periodically test the `main` branch. These runs are the primary data source for detecting regressions: + +- **Workflow**: `pr-test.yml` with `event: schedule` +- **Branch**: `main` +- **Dashboard**: https://github.com/sgl-project/sglang/actions/workflows/pr-test.yml?query=event%3Aschedule +- **Frequency**: Runs multiple times daily, each pinned to the HEAD of `main` at trigger time +- **Purpose**: Catches regressions that slip through PR-level CI (e.g., interaction bugs between merged PRs, hardware-specific issues) + +Always use these scheduled runs (not PR-triggered runs) when bisecting regressions on `main`. The `--event schedule` filter in `gh run list` ensures you only see these periodic main-branch runs. 
+ +## Workflow + +### Phase 1: Extract the Failure Signature + +1. **Get the failing test details from CI logs.** If given a URL, fetch logs directly. If given a test name, find recent scheduled runs of `pr-test.yml` on `main` that failed: + +```bash +# List recent scheduled runs targeting main (the primary source of truth for regressions) +# These are cron-triggered runs visible at: +# https://github.com/sgl-project/sglang/actions/workflows/pr-test.yml?query=event%3Aschedule +gh run list --repo sgl-project/sglang --workflow="pr-test.yml" --event schedule --branch main --limit 20 --json databaseId,conclusion,createdAt,headSha + +# Find the job containing the test +gh run view {RUN_ID} --repo sgl-project/sglang --json jobs --jq '.jobs[] | select(.conclusion == "failure") | {name, conclusion, databaseId}' + +# Get the failure details +gh run view {RUN_ID} --repo sgl-project/sglang --job {JOB_ID} --log 2>&1 | grep -E -B 5 -A 30 "AssertionError|FAIL|Error|{TEST_NAME}" +``` + +2. **Record the failure signature:** + - Exact error message and assertion + - Affected test method name + - Model/config involved + - Numeric values (e.g., tolerance diffs, scores) + - Whether the failure is deterministic (same values across runs) + +### Phase 2: Temporal Bisection + +3. **Find the boundary between passing and failing runs.** Walk through the scheduled run history (from the `pr-test.yml` schedule runs on `main`) to identify: + - Last known PASSING run (sha + date) + - First known FAILING run (sha + date) + +```bash +# For each scheduled run, check the specific partition/job status +gh run view {RUN_ID} --repo sgl-project/sglang --json jobs --jq '.jobs[] | select(.name == "{JOB_NAME}") | {conclusion, databaseId}' + +# Verify a specific test passed or failed in a run +gh run view {RUN_ID} --repo sgl-project/sglang --job {JOB_ID} --log 2>&1 | grep -E "{TEST_NAME}|PASSED|FAILED|logprobs mismatch" | head -10 +``` + +4. 
**List commits between the boundary:** + +```bash +git log --oneline {LAST_PASS_SHA}..{FIRST_FAIL_SHA} +``` + +5. **Filter for relevant commits** that touch files related to the failing test (model layers, kernels, test utilities, etc.): + +```bash +git log --oneline {LAST_PASS_SHA}..{FIRST_FAIL_SHA} -- {relevant_paths} +``` + +### Phase 3: Runner/Hardware Analysis + +6. **Check if the failure is runner-specific.** Extract the runner identity from each failing and passing run: + +```bash +# Get runner name and machine +gh run view {RUN_ID} --repo sgl-project/sglang --job {JOB_ID} --log 2>&1 | grep -E "Runner name|Machine name" | head -5 + +# Get GPU/driver info +gh run view {RUN_ID} --repo sgl-project/sglang --job {JOB_ID} --log 2>&1 | grep -i -E "NVIDIA-SMI|Driver Version|CUDA Version" | head -5 + +# Get package versions +gh run view {RUN_ID} --repo sgl-project/sglang --job {JOB_ID} --log 2>&1 | grep -E "sgl.kernel.*==|flashinfer.*==" | head -5 +``` + +7. **Correlate runners with pass/fail outcomes.** Build a table: + +| Run ID | Date | Runner | GPU Type | Driver | Result | +|--------|------|--------|----------|--------|--------| + +If all failures map to a specific runner type/GPU and all passes map to another, the issue is **hardware-specific**, not a code regression. + +### Phase 4: Code Analysis + +8. **If a code regression is suspected** (failures not runner-specific), examine the candidate commits: + - Read the changed files + - Understand how the changes could affect the failing test + - Look for prefill-vs-decode differences, TP-specific paths, kernel changes + +9. **If a hardware issue is suspected**, analyze: + - Kernel compatibility (CUDA compute capability) + - Driver version differences + - All-reduce / NCCL behavior differences + - CUDA graph capture differences across GPU architectures + +### Phase 5: Remote Reproduction (Optional) + +Only if SSH target and docker container were provided. + +10. 
**Verify the remote environment:** + +```bash +ssh {SSH_TARGET} "docker exec {CONTAINER} nvidia-smi --query-gpu=name,driver_version --format=csv" +ssh {SSH_TARGET} "docker exec {CONTAINER} pip show sgl-kernel sglang flashinfer-python 2>&1 | grep -E 'Name:|Version:'" +``` + +11. **Ensure latest code is installed.** If the container is stale, update: + +```bash +# Try fetching latest main +ssh {SSH_TARGET} "docker exec {CONTAINER} bash -c 'cd /path/to/sglang && git fetch origin main && git checkout origin/main'" +# Or download and install from tarball if git auth fails +ssh {SSH_TARGET} "docker exec {CONTAINER} bash -c 'cd /tmp && curl -L https://github.com/sgl-project/sglang/archive/refs/heads/main.tar.gz | tar xz && cd sglang-main && pip install -e \"python[all]\"'" +# Reinstall (after git fetch) +ssh {SSH_TARGET} "docker exec {CONTAINER} bash -c 'cd /path/to/sglang && pip install -e \"python[all]\"'" +# Install test dependencies if needed +ssh {SSH_TARGET} "docker exec {CONTAINER} pip install peft rouge-score" +``` + +12. **Create a minimal reproduction script** that: + - Uses `if __name__ == '__main__'` with `mp.set_start_method("spawn")` + - Runs the specific failing test configuration + - Prints key metrics (diffs, scores, outputs) + - Exits with code 1 on failure + +13. **Copy and run the reproduction script:** + +```bash +scp /tmp/repro_script.py {SSH_TARGET}:/tmp/ +ssh {SSH_TARGET} "docker cp /tmp/repro_script.py {CONTAINER}:/tmp/" +ssh {SSH_TARGET} "docker exec -e CUDA_VISIBLE_DEVICES=0,1 {CONTAINER} python3 /tmp/repro_script.py" +``` + +14. **Run control experiments** to isolate the variable: + - If suspecting TP issue: run with TP=1 as control + - If suspecting GPU issue: compare same code on different GPU + - If suspecting a specific commit: test before/after that commit + +### Phase 6: Report + +15. 
**Produce a structured report:** + +```markdown +## CI Regression Bisection Report + +### Failure Signature +- **Test**: {test_file}::{test_method} +- **Error**: {exact error message} +- **Key metrics**: {numeric values} +- **Deterministic**: Yes/No + +### Root Cause Classification +One of: +- **Code Regression**: PR #{number} introduced the bug +- **Hardware-Specific**: Fails on {GPU_TYPE}, passes on others +- **Environment Change**: New runner/driver/package version +- **Pre-existing Flakiness**: Intermittent, not a new regression + +### Evidence +| Condition | Result | +|-----------|--------| +| {condition1} | PASS/FAIL | +| {condition2} | PASS/FAIL | + +### Timeline +- {date}: Last known pass ({sha}, {runner}) +- {date}: First known fail ({sha}, {runner}) +- {date}: Confirmed reproduction on {server} + +### Recommended Fix +- **Short-term**: {workaround} +- **Long-term**: {proper fix} +``` + +## Key Patterns to Recognize + +| Pattern | Diagnosis | +|---------|-----------| +| Same SHA passes on runner A, fails on runner B | Hardware/runner-specific | +| All runners fail after commit X | Code regression from commit X | +| Intermittent - same runner sometimes passes/fails | Flaky test or race condition | +| Prefill OK but decode fails | TP/all-reduce issue in decode path | +| Works with TP=1, fails with TP>1 | Tensor parallelism bug | +| Exact same numeric diff every time | Deterministic bug, not flakiness | + +## Important Notes + +- **Always check runner identity** before concluding it's a code regression. Many "consistent" failures are actually runner-specific. +- **Test partition assignments change over time** as tests are added/removed. A test may move between partitions, landing on different runner types. +- **H200 runners** use `/root/actions-runner/` path and machine names like `gpu-h200-worker-*`. Non-H200 runners use `/public_sglang_ci/runner-*` paths. 
+- When running remote reproduction, use `run_in_background` for long-running tests and check output with `TaskOutput`. +- Container environments may be stale - always verify package versions match CI before drawing conclusions. diff --git a/.claude/skills/write-sglang-test/SKILL.md b/.claude/skills/write-sglang-test/SKILL.md new file mode 100644 index 000000000000..030b451a988a --- /dev/null +++ b/.claude/skills/write-sglang-test/SKILL.md @@ -0,0 +1,248 @@ +--- +name: write-sglang-test +description: Guide for writing SGLang CI/UT tests following project conventions. Covers CustomTestCase, CI registration, server fixtures, model selection, and test placement. Use when creating new tests, adding CI test cases, writing unit tests, or when the user asks to add tests for SGLang features. +--- + +# Writing SGLang CI / UT Tests + +## Core Rules + +1. **Always use `CustomTestCase`** — never raw `unittest.TestCase` +2. **Place tests in `test/registered//`** — only use `test/manual/` for debugging / non-CI tests +3. **Reuse server fixtures** — inherit from `DefaultServerBase` or write `setUpClass`/`tearDownClass` with `popen_launch_server` +4. **Smallest model for model-agnostic functionality** — use `DEFAULT_SMALL_MODEL_NAME_FOR_TEST` (Llama-3.2-1B-Instruct) for basic features that don't depend on model size +5. **8B for general performance** — use `DEFAULT_MODEL_NAME_FOR_TEST` (Llama-3.1-8B-Instruct, single-node) for performance tests that don't involve spec / DP / parallelism +6. **Bigger features → discuss case by case** — spec, DP attention, tensor/pipeline parallelism etc. 
may need multi-GPU suites and specific models + +--- + +## Test File Template + +### Functional correctness test (small model) + +```python +import unittest + +import requests + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.test_utils import ( + DEFAULT_SMALL_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + popen_launch_server, +) + +register_cuda_ci(est_time=60, suite="stage-b-test-small-1-gpu") + + +class TestMyFeature(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=["--arg1", "value1"], # feature-specific args + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_basic_functionality(self): + response = requests.post( + self.base_url + "/generate", + json={"text": "Hello", "sampling_params": {"max_new_tokens": 32}}, + ) + self.assertEqual(response.status_code, 200) + + +if __name__ == "__main__": + unittest.main(verbosity=3) +``` + +### General performance test (8B model, single node, no spec/DP/parallelism) + +```python +import time +import unittest + +import requests + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + popen_launch_server, +) + +register_cuda_ci(est_time=300, suite="stage-b-test-large-1-gpu") + + +class TestMyFeaturePerf(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, 
+ ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_latency(self): + start = time.perf_counter() + response = requests.post( + self.base_url + "/generate", + json={"text": "Hello", "sampling_params": {"max_new_tokens": 128}}, + ) + elapsed = time.perf_counter() - start + self.assertEqual(response.status_code, 200) + self.assertLess(elapsed, 5.0, "Latency exceeded threshold") + + +if __name__ == "__main__": + unittest.main(verbosity=3) +``` + +--- + +## Server Fixture Reuse + +For tests that only need a standard server, inherit from `DefaultServerBase` and override class attributes: + +```python +from sglang.test.server_fixtures.default_fixture import DefaultServerBase + +class TestMyFeature(DefaultServerBase): + model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST + other_args = ["--enable-my-feature"] + + def test_something(self): + ... +``` + +Available fixtures in `python/sglang/test/server_fixtures/`: + +| Fixture | Use case | +|---------|----------| +| `DefaultServerBase` | Standard single-server tests | +| `EagleServerBase` | EAGLE speculative decoding | +| `PDDisaggregationServerBase` | Disaggregated prefill/decode | +| `MMMUServerBase` | Multimodal VLM tests | + +--- + +## CI Registration + +Every test file in `test/registered/` **must** call a registration function at module level: + +```python +from sglang.test.ci.ci_register import register_cuda_ci, register_amd_ci + +register_cuda_ci(est_time=60, suite="stage-b-test-small-1-gpu") +register_amd_ci(est_time=60, suite="stage-b-test-small-1-gpu-amd") # optional +``` + +Parameters: +- `est_time`: estimated runtime in seconds (used for CI partitioning) +- `suite`: which CI suite to run in (see below) +- `nightly=True`: for nightly-only tests (default `False` = per-commit) +- `disabled="reason"`: temporarily disable with explanation + +### Suite selection guide + +**Default cases (1 GPU):** + +| Scenario | Model | Suite | +|----------|-------|-------| +| Model-agnostic basic 
functionality | 1B (smallest) | `stage-b-test-small-1-gpu` |
+| General performance (no spec/DP/parallelism) | 8B | `stage-b-test-large-1-gpu` |
+
+**Bigger features (case by case):**
+
+| Scenario | Suite |
+|----------|-------|
+| 2 GPU (e.g. TP=2) | `stage-b-test-large-2-gpu` |
+| 4 GPU (H100) | `stage-c-test-4-gpu-h100` |
+| 8 GPU (H200) | `stage-c-test-8-gpu-h200` |
+| Nightly, 1 GPU | `nightly-1-gpu` |
+| Nightly, 8 GPU | `nightly-8-gpu` |
+
+For spec, DP attention, parallelism, disaggregation, etc., discuss with the team to determine the appropriate suite and GPU configuration.
+
+---
+
+## Model Constants
+
+All defined in `python/sglang/test/test_utils.py`:
+
+| Constant | Model | When to use |
+|----------|-------|-------------|
+| `DEFAULT_SMALL_MODEL_NAME_FOR_TEST` | Llama-3.2-1B-Instruct | Model-agnostic basic functionality |
+| `DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE` | Llama-3.2-1B | Base (non-instruct) model tests |
+| `DEFAULT_MODEL_NAME_FOR_TEST` | Llama-3.1-8B-Instruct | General performance (single node) |
+| `DEFAULT_MOE_MODEL_NAME_FOR_TEST` | Mixtral-8x7B-Instruct | MoE-specific tests |
+| `DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST` | — | Embedding tests |
+| `DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST` | — | Vision-language tests |
+
+---
+
+## Test Placement
+
+```
+test/
+├── registered/ # CI tests (auto-discovered by run_suite.py)
+│ ├── sampling/ # test_penalty.py, test_sampling_params.py ...
+│ ├── sessions/ # test_session_control.py ...
+│ ├── openai_server/ # basic/, features/, validation/ ...
+│ ├── spec/ # eagle/, utils/ ...
+│ ├── models/ # model-specific accuracy tests
+│ ├── perf/ # performance benchmarks
+│ └── <category>/ # create new category if needed
+├── manual/ # Non-CI: debugging, one-off, manual verification
+└── run_suite.py # CI runner (scans registered/ only)
+```
+
+**Decision rule**: if the test should run in CI → `registered/`. If it's for local debugging or requires special hardware not in CI → `manual/`.
+ +--- + +## Key Utilities + +```python +from sglang.test.test_utils import ( + CustomTestCase, # base class with retry logic + popen_launch_server, # launch server subprocess + DEFAULT_URL_FOR_TEST, # auto-configured base URL + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, # 600s default + run_bench_serving, # benchmark helper (launch + bench) +) +from sglang.srt.utils import kill_process_tree # cleanup server +``` + +--- + +## Checklist + +Before submitting a test: + +- [ ] Inherits from `CustomTestCase` (not `unittest.TestCase`) +- [ ] Has `register_*_ci(...)` call at module level +- [ ] Placed in `test/registered//` +- [ ] Model selection: smallest for model-agnostic features, 8B for general perf, case-by-case for other complex features +- [ ] `setUpClass` launches server, `tearDownClass` kills it +- [ ] Has `if __name__ == "__main__": unittest.main(verbosity=3)` +- [ ] `est_time` is reasonable (measure locally) diff --git a/.github/CI_PERMISSIONS.json b/.github/CI_PERMISSIONS.json index c661d147f028..13f95d5d577b 100644 --- a/.github/CI_PERMISSIONS.json +++ b/.github/CI_PERMISSIONS.json @@ -55,6 +55,13 @@ "reason": "top contributor", "can_rerun_stage": true }, + "Chen-0210": { + "can_tag_run_ci_label": true, + "can_rerun_failed_ci": true, + "cooldown_interval_minutes": 60, + "reason": "custom override", + "can_rerun_stage": true + }, "ClawSeven": { "can_tag_run_ci_label": true, "can_rerun_failed_ci": true, @@ -121,7 +128,7 @@ "HandH1998": { "can_tag_run_ci_label": true, "can_rerun_failed_ci": true, - "cooldown_interval_minutes": 60, + "cooldown_interval_minutes": 0, "reason": "custom override", "can_rerun_stage": true }, @@ -188,6 +195,13 @@ "reason": "custom override", "can_rerun_stage": true }, + "Prozac614": { + "can_tag_run_ci_label": true, + "can_rerun_failed_ci": true, + "cooldown_interval_minutes": 60, + "reason": "custom override", + "can_rerun_stage": true + }, "Qiaolin-Yu": { "can_tag_run_ci_label": true, "can_rerun_failed_ci": true, @@ -202,6 +216,20 @@ 
"reason": "custom override", "can_rerun_stage": true }, + "Ratish1": { + "can_tag_run_ci_label": true, + "can_rerun_failed_ci": true, + "cooldown_interval_minutes": 0, + "reason": "custom override", + "can_rerun_stage": true + }, + "RubiaCx": { + "can_tag_run_ci_label": true, + "can_rerun_failed_ci": true, + "cooldown_interval_minutes": 60, + "reason": "custom override", + "can_rerun_stage": true + }, "ShangmingCai": { "can_tag_run_ci_label": true, "can_rerun_failed_ci": true, @@ -209,6 +237,13 @@ "reason": "top contributor", "can_rerun_stage": true }, + "Shunkangz": { + "can_tag_run_ci_label": true, + "can_rerun_failed_ci": true, + "cooldown_interval_minutes": 60, + "reason": "custom override", + "can_rerun_stage": true + }, "SimonCqk": { "can_tag_run_ci_label": true, "can_rerun_failed_ci": true, @@ -412,6 +447,13 @@ "reason": "custom override", "can_rerun_stage": true }, + "dongjiyingdjy": { + "can_tag_run_ci_label": true, + "can_rerun_failed_ci": true, + "cooldown_interval_minutes": 60, + "reason": "custom override", + "can_rerun_stage": true + }, "dougyster": { "can_tag_run_ci_label": true, "can_rerun_failed_ci": true, @@ -636,6 +678,13 @@ "reason": "top contributor", "can_rerun_stage": true }, + "kpham-sgl": { + "can_tag_run_ci_label": true, + "can_rerun_failed_ci": true, + "cooldown_interval_minutes": 0, + "reason": "custom override", + "can_rerun_stage": true + }, "kssteven418": { "can_tag_run_ci_label": true, "can_rerun_failed_ci": true, @@ -706,6 +755,20 @@ "reason": "custom override", "can_rerun_stage": true }, + "mmangkad": { + "can_tag_run_ci_label": true, + "can_rerun_failed_ci": true, + "cooldown_interval_minutes": 0, + "reason": "custom override", + "can_rerun_stage": true + }, + "narutolhy": { + "can_tag_run_ci_label": true, + "can_rerun_failed_ci": true, + "cooldown_interval_minutes": 0, + "reason": "custom override", + "can_rerun_stage": true + }, "netanel-haber": { "can_tag_run_ci_label": true, "can_rerun_failed_ci": true, @@ -811,6 +874,13 @@ 
"reason": "top contributor", "can_rerun_stage": true }, + "samuellees": { + "can_tag_run_ci_label": true, + "can_rerun_failed_ci": true, + "cooldown_interval_minutes": 60, + "reason": "custom override", + "can_rerun_stage": true + }, "scottjlee": { "can_tag_run_ci_label": true, "can_rerun_failed_ci": true, @@ -825,6 +895,13 @@ "reason": "top contributor", "can_rerun_stage": true }, + "sglang-npu-bot": { + "can_tag_run_ci_label": true, + "can_rerun_failed_ci": true, + "cooldown_interval_minutes": 0, + "reason": "custom override", + "can_rerun_stage": true + }, "shaharmor98": { "can_tag_run_ci_label": true, "can_rerun_failed_ci": true, diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 8b757ead9232..1717f1f80db8 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,18 +1,20 @@ -.github @merrymercy @Fridge003 @ispobock @Kangyan-Zhou -/docker @Fridge003 @ispobock @HaiShaw @ishandhanani +.github @merrymercy @Fridge003 @ispobock @Kangyan-Zhou @bingxche +/docker @Fridge003 @ispobock @HaiShaw @ishandhanani @yctseng0211 /docker/npu.Dockerfile @ping1jing2 @iforgetmyname /python/pyproject.toml @merrymercy @Fridge003 @ispobock -/python/sglang/jit_kernel @DarkSharpness @BBuf +/python/sglang/jit_kernel @DarkSharpness @BBuf @celve @HydraQYH @yuan-luo /python/sglang/jit_kernel/diffusion @yingluosanqian @BBuf @mickqian -/python/sglang/multimodal_gen @mickqian @yhyang201 -/python/sglang/multimodal_gen/runtime/layers @mickqian @yhyang201 @BBuf @yingluosanqian -/python/sglang/multimodal_gen/runtime/models/dits @mickqian @yhyang201 @BBuf @yingluosanqian +/python/sglang/multimodal_gen @mickqian @yhyang201 @ping1jing2 +/python/sglang/multimodal_gen/runtime/cache @DefTruth +/python/sglang/multimodal_gen/runtime/layers @mickqian @yhyang201 @BBuf @yingluosanqian @ping1jing2 +/python/sglang/multimodal_gen/runtime/models/dits @mickqian @yhyang201 @BBuf @yingluosanqian @ping1jing2 /python/sglang/srt/batch_invariant_ops @Fridge003 @hebiao064 /python/sglang/srt/constrained 
@hnyls2002 @DarkSharpness /python/sglang/srt/compilation @hebiao064 /python/sglang/srt/disaggregation @ByronHsu @hnyls2002 @ShangmingCai /python/sglang/srt/disaggregation/ascend @ping1jing2 @iforgetmyname /python/sglang/srt/distributed @yizhang2077 @merrymercy @ch-wan +/python/sglang/srt/dllm @ClawSeven @btw616 /python/sglang/srt/entrypoints @ispobock @CatherineSue @slin1237 @merrymercy @JustinTong0323 /python/sglang/srt/entrypoints/grpc_server.py @CatherineSue @slin1237 /python/sglang/srt/eplb @fzyzcjy @ch-wan @@ -21,11 +23,13 @@ /python/sglang/srt/hardware_backend/npu @ping1jing2 @iforgetmyname /python/sglang/srt/hardware_backend/npu/quantization @OrangeRedeng @TamirBaydasov @iforgetmyname /python/sglang/srt/layers @merrymercy @Ying1123 @Fridge003 @ispobock @HaiShaw @ch-wan @BBuf @Edwardf0t1 -/python/sglang/srt/layers/attention @merrymercy @Fridge003 @ispobock @Qiaolin-Yu @hebiao064 +/python/sglang/srt/layers/attention @merrymercy @Fridge003 @ispobock @Qiaolin-Yu @hebiao064 @HaiShaw /python/sglang/srt/layers/attention/fla @yizhang2077 @hebiao064 /python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py @yizhang2077 @hebiao064 @hanming-lu /python/sglang/srt/layers/attention/mamba @yizhang2077 @hebiao064 -/python/sglang/srt/layers/quantization @ch-wan @BBuf @Edwardf0t1 @FlamingoPg @AniZpZ +/python/sglang/srt/layers/attention/nsa @1am9trash @hubertlu-tw @kkHuang-amd @HaiShaw @Fridge003 @hlu1 @rainj-me +/python/sglang/srt/layers/quantization @ch-wan @BBuf @Edwardf0t1 @FlamingoPg @AniZpZ @HaiShaw +/python/sglang/srt/layers/quantization/quark @kkHuang-amd @yichiche @hubertlu-tw @1am9trash @BowenBao /python/sglang/srt/lora @Ying1123 @Fridge003 @lifuhuang /python/sglang/srt/managers @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann /python/sglang/srt/managers/scheduler_pp_mixin.py @ShangmingCai @XucSh @@ -34,6 +38,7 @@ /python/sglang/srt/model_executor/piecewise_cuda_graph_runner.py @hebiao064 /python/sglang/srt/models/deepseek_v2.py @fzyzcjy @zhyncs @ispobock 
@ch-wan @merrymercy @Fridge003 /python/sglang/srt/multimodal @mickqian @JustinTong0323 @yhyang201 @yuan-luo +/python/sglang/srt/observability @merrymercy @fzyzcjy @sufeng-buaa /python/sglang/srt/speculative @Ying1123 @merrymercy @hnyls2002 /sgl-kernel @zhyncs @ispobock @BBuf @yizhang2077 @merrymercy @FlamingoPg @HaiShaw /sgl-model-gateway @slin1237 @CatherineSue diff --git a/.github/actions/upload-cuda-coredumps/action.yml b/.github/actions/upload-cuda-coredumps/action.yml new file mode 100644 index 000000000000..0e9fdde2799d --- /dev/null +++ b/.github/actions/upload-cuda-coredumps/action.yml @@ -0,0 +1,27 @@ +name: Upload CUDA Coredumps +description: Upload CUDA coredump files as artifacts and clean up the directory. + +inputs: + artifact-suffix: + description: Suffix appended to the artifact name (e.g. matrix partition id) + required: false + default: "" + retention-days: + description: Number of days to retain the artifact + required: false + default: "7" + +runs: + using: composite + steps: + - name: Upload CUDA coredumps + uses: actions/upload-artifact@v4 + with: + name: cuda-coredumps-${{ github.job }}${{ inputs.artifact-suffix && format('-{0}', inputs.artifact-suffix) }} + path: ${{ env.SGLANG_CUDA_COREDUMP_DIR || '/tmp/sglang_cuda_coredumps' }}/ + retention-days: ${{ inputs.retention-days }} + if-no-files-found: ignore + + - name: Cleanup CUDA coredumps + shell: bash + run: rm -rf "${{ env.SGLANG_CUDA_COREDUMP_DIR || '/tmp/sglang_cuda_coredumps' }}" diff --git a/.github/workflows/amd-aiter-scout.yml b/.github/workflows/amd-aiter-scout.yml new file mode 100644 index 000000000000..9e7b413bc57d --- /dev/null +++ b/.github/workflows/amd-aiter-scout.yml @@ -0,0 +1,161 @@ +name: AMD AITER Scout + +on: + schedule: + - cron: '0 20 * * 1' # Monday 20:00 UTC + - cron: '0 20 * * 4' # Thursday 20:00 UTC + workflow_dispatch: + inputs: + aiter_ref: + description: 'AITER git ref (branch, tag, or SHA). 
Default: main (latest commit)' + required: false + type: string + default: 'main' + job_filter: + description: 'Comma-separated workflows to run: nightly-amd, nightly-amd-rocm720, pr-test-amd, pr-test-amd-rocm720. Default: all' + required: false + type: string + default: 'all' + continue_on_error: + description: 'Continue running other workflows even if one fails' + required: false + type: boolean + default: true + +concurrency: + group: amd-aiter-scout-${{ github.run_id }} + cancel-in-progress: true + +jobs: + resolve-aiter: + runs-on: ubuntu-latest + outputs: + aiter_sha: ${{ steps.resolve.outputs.sha }} + run_nightly_amd: ${{ steps.parse.outputs.run_nightly_amd }} + run_nightly_amd_rocm720: ${{ steps.parse.outputs.run_nightly_amd_rocm720 }} + run_pr_test_amd: ${{ steps.parse.outputs.run_pr_test_amd }} + run_pr_test_amd_rocm720: ${{ steps.parse.outputs.run_pr_test_amd_rocm720 }} + steps: + - name: Resolve AITER commit + id: resolve + run: | + REF="${{ inputs.aiter_ref || 'main' }}" + echo "Resolving AITER ref: ${REF}" + + SHA=$(git ls-remote https://github.com/ROCm/aiter.git "refs/heads/${REF}" | head -1 | cut -f1) + if [ -z "$SHA" ]; then + SHA=$(git ls-remote https://github.com/ROCm/aiter.git "refs/tags/${REF}" | head -1 | cut -f1) + fi + if [ -z "$SHA" ]; then + SHA=$(git ls-remote https://github.com/ROCm/aiter.git "${REF}" | head -1 | cut -f1) + fi + if [ -z "$SHA" ]; then + SHA="${REF}" + fi + + echo "sha=${SHA}" >> $GITHUB_OUTPUT + echo "### AITER Ref Resolution" >> $GITHUB_STEP_SUMMARY + echo "- **Requested ref:** \`${REF}\`" >> $GITHUB_STEP_SUMMARY + echo "- **Resolved SHA:** \`${SHA}\`" >> $GITHUB_STEP_SUMMARY + echo "- **AITER commit:** https://github.com/ROCm/aiter/commit/${SHA}" >> $GITHUB_STEP_SUMMARY + + - name: Parse job filter + id: parse + run: | + FILTER="${{ inputs.job_filter || 'all' }}" + echo "Job filter: ${FILTER}" + + if [[ "$FILTER" == "all" ]]; then + echo "run_nightly_amd=true" >> $GITHUB_OUTPUT + echo "run_nightly_amd_rocm720=true" >> 
$GITHUB_OUTPUT + echo "run_pr_test_amd=true" >> $GITHUB_OUTPUT + echo "run_pr_test_amd_rocm720=true" >> $GITHUB_OUTPUT + else + # Wrap with commas for exact substring matching (avoids "nightly-amd" matching "nightly-amd-rocm720") + PADDED=",${FILTER// /}," + echo "run_nightly_amd=$(echo "$PADDED" | grep -q ',nightly-amd,' && echo true || echo false)" >> $GITHUB_OUTPUT + echo "run_nightly_amd_rocm720=$(echo "$PADDED" | grep -q ',nightly-amd-rocm720,' && echo true || echo false)" >> $GITHUB_OUTPUT + echo "run_pr_test_amd=$(echo "$PADDED" | grep -q ',pr-test-amd,' && echo true || echo false)" >> $GITHUB_OUTPUT + echo "run_pr_test_amd_rocm720=$(echo "$PADDED" | grep -q ',pr-test-amd-rocm720,' && echo true || echo false)" >> $GITHUB_OUTPUT + fi + + echo "### Job Filter" >> $GITHUB_STEP_SUMMARY + echo "- **Filter:** \`${FILTER}\`" >> $GITHUB_STEP_SUMMARY + + call-nightly-amd: + if: needs.resolve-aiter.outputs.run_nightly_amd == 'true' + needs: resolve-aiter + uses: ./.github/workflows/nightly-test-amd.yml + secrets: inherit + with: + ref: ${{ github.sha }} + aiter_ref: ${{ needs.resolve-aiter.outputs.aiter_sha }} + job_filter: 'all' + continue_on_error: ${{ inputs.continue_on_error == '' && true || inputs.continue_on_error }} + + call-nightly-amd-rocm720: + if: needs.resolve-aiter.outputs.run_nightly_amd_rocm720 == 'true' + needs: resolve-aiter + uses: ./.github/workflows/nightly-test-amd-rocm720.yml + secrets: inherit + with: + ref: ${{ github.sha }} + aiter_ref: ${{ needs.resolve-aiter.outputs.aiter_sha }} + job_filter: 'all' + continue_on_error: ${{ inputs.continue_on_error == '' && true || inputs.continue_on_error }} + + call-pr-test-amd: + if: needs.resolve-aiter.outputs.run_pr_test_amd == 'true' + needs: resolve-aiter + uses: ./.github/workflows/pr-test-amd.yml + secrets: inherit + with: + run_all_tests: true + aiter_ref: ${{ needs.resolve-aiter.outputs.aiter_sha }} + continue_on_error: ${{ inputs.continue_on_error == '' && true || inputs.continue_on_error }} + + 
call-pr-test-amd-rocm720: + if: needs.resolve-aiter.outputs.run_pr_test_amd_rocm720 == 'true' + needs: resolve-aiter + uses: ./.github/workflows/pr-test-amd-rocm720.yml + secrets: inherit + with: + run_all_tests: true + aiter_ref: ${{ needs.resolve-aiter.outputs.aiter_sha }} + continue_on_error: ${{ inputs.continue_on_error == '' && true || inputs.continue_on_error }} + + check-all-jobs: + if: always() + needs: + - resolve-aiter + - call-nightly-amd + - call-nightly-amd-rocm720 + - call-pr-test-amd + - call-pr-test-amd-rocm720 + runs-on: ubuntu-latest + steps: + - name: Summary + run: | + echo "## AMD AITER Scout Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "- **AITER SHA:** \`${{ needs.resolve-aiter.outputs.aiter_sha }}\`" >> $GITHUB_STEP_SUMMARY + echo "- **AITER commit:** https://github.com/ROCm/aiter/commit/${{ needs.resolve-aiter.outputs.aiter_sha }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Workflow | Result |" >> $GITHUB_STEP_SUMMARY + echo "|----------|--------|" >> $GITHUB_STEP_SUMMARY + echo "| Nightly AMD (AITER Latest) | \`${{ needs.call-nightly-amd.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| Nightly AMD ROCm 7.2 | \`${{ needs.call-nightly-amd-rocm720.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| PR Test AMD (AITER Latest) | \`${{ needs.call-pr-test-amd.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| PR Test AMD ROCm 7.2 | \`${{ needs.call-pr-test-amd-rocm720.result }}\` |" >> $GITHUB_STEP_SUMMARY + + - name: Check if any job failed + run: | + if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then + echo "One or more workflows failed" + exit 1 + fi + if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then + echo "One or more workflows were cancelled" + exit 1 + fi + echo "All workflows passed" diff --git a/.github/workflows/amd-ci-job-monitor.yml b/.github/workflows/amd-ci-job-monitor.yml new file mode 100644 index 000000000000..1d89deb9ae9b --- /dev/null +++ 
b/.github/workflows/amd-ci-job-monitor.yml @@ -0,0 +1,149 @@ +name: AMD CI Job Monitor + +on: + schedule: + - cron: '0 0 * * *' # Daily at midnight UTC + pull_request: + paths: + - '.github/workflows/amd-ci-job-monitor.yml' + - 'scripts/ci/query_job_status.py' + workflow_dispatch: + inputs: + hours: + description: 'Time window in hours' + required: false + default: '24' + type: string + job_filter: + description: 'Job name filter (leave empty for all AMD jobs)' + required: false + type: string + +jobs: + # Single job filter mode + custom-report: + name: Custom Job Report + if: ${{ inputs.job_filter }} + runs-on: ubuntu-latest + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install dependencies + run: pip install tabulate + + - name: Generate Custom Job Report + timeout-minutes: 30 + run: | + python scripts/ci/query_job_status.py \ + --repo ${{ github.repository }} \ + --job "${{ inputs.job_filter }}" \ + --workflow "pr-test-amd.yml" \ + --hours ${{ inputs.hours || '24' }} \ + --summary + + # Parse workflow files to get job names dynamically + parse-workflows: + name: Parse Workflow Jobs + if: ${{ !inputs.job_filter }} + runs-on: ubuntu-latest + outputs: + pr_jobs: ${{ steps.parse.outputs.pr_jobs }} + nightly_jobs: ${{ steps.parse.outputs.nightly_jobs }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Parse workflow files + id: parse + run: | + # Parse pr-test-amd.yml and extract job names (exclude utility jobs) + # Excluded: call-gate, check-changes, pr-test-amd-finish, cancel, check-all-jobs + pr_jobs=$(yq -r '.jobs | keys | .[]' .github/workflows/pr-test-amd.yml | \ + grep -v -E '^(call-gate|check-changes|pr-test-amd-finish|cancel|check-all-jobs)$' | \ + jq -R -s -c 'split("\n") | map(select(length > 0))') + echo "pr_jobs=$pr_jobs" >> $GITHUB_OUTPUT + echo "PR jobs: $pr_jobs" + 
+ # Parse nightly-test-amd.yml and extract job names (exclude utility jobs) + # Excluded: check-all-jobs + nightly_jobs=$(yq -r '.jobs | keys | .[]' .github/workflows/nightly-test-amd.yml | \ + grep -v -E '^(check-all-jobs)$' | \ + jq -R -s -c 'split("\n") | map(select(length > 0))') + echo "nightly_jobs=$nightly_jobs" >> $GITHUB_OUTPUT + echo "Nightly jobs: $nightly_jobs" + + # PR CI reports using dynamic matrix + pr-ci-reports: + name: PR - ${{ matrix.job_name }} + needs: parse-workflows + if: ${{ !inputs.job_filter }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + job_name: ${{ fromJson(needs.parse-workflows.outputs.pr_jobs) }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install dependencies + run: pip install tabulate + + - name: Generate Report + timeout-minutes: 15 + run: | + python scripts/ci/query_job_status.py \ + --repo ${{ github.repository }} \ + --job "${{ matrix.job_name }}" \ + --workflow "pr-test-amd.yml" \ + --hours ${{ inputs.hours || '24' }} \ + --summary + + # Nightly AMD test reports using dynamic matrix + nightly-reports: + name: Nightly - ${{ matrix.job_name }} + needs: parse-workflows + if: ${{ !inputs.job_filter }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + job_name: ${{ fromJson(needs.parse-workflows.outputs.nightly_jobs) }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install dependencies + run: pip install tabulate + + - name: Generate Nightly Report + timeout-minutes: 15 + run: | + python scripts/ci/query_job_status.py \ + --repo ${{ github.repository }} \ + --job "${{ matrix.job_name }}" \ + --workflow "nightly-test-amd.yml" \ + --hours ${{ inputs.hours || '24' }} 
\ + --summary diff --git a/.github/workflows/bot-bump-flashinfer-version.yml b/.github/workflows/bot-bump-flashinfer-version.yml new file mode 100644 index 000000000000..cc1cba930ce2 --- /dev/null +++ b/.github/workflows/bot-bump-flashinfer-version.yml @@ -0,0 +1,50 @@ +name: Bot Bump Flashinfer Version + +on: + workflow_dispatch: + inputs: + new_version: + description: 'New flashinfer version (e.g., 0.6.4)' + required: true + type: string + +permissions: + contents: write + pull-requests: write + +jobs: + bump-flashinfer-version: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install Python dependencies + run: | + pip install tomli + + - name: Configure Git and branch + run: | + git config user.name "sglang-bot" + git config user.email "sglang-bot@users.noreply.github.com" + RANDOM_SUFFIX=$(echo $RANDOM | md5sum | head -c 4) + BRANCH_NAME="bot/bump-flashinfer-version-${{ github.event.inputs.new_version }}-${RANDOM_SUFFIX}" + git checkout -b "$BRANCH_NAME" + echo "BRANCH_NAME=$BRANCH_NAME" >> $GITHUB_ENV + + - name: Run flashinfer version bump script + run: | + python scripts/release/bump_flashinfer_version.py "${{ github.event.inputs.new_version }}" + + - name: Commit and create PR + env: + GH_TOKEN: ${{ secrets.GH_PAT_FOR_PULL_REQUEST }} + run: | + bash scripts/release/commit_and_pr.sh "flashinfer" "${{ github.event.inputs.new_version }}" "$BRANCH_NAME" diff --git a/.github/workflows/diffusion-ci-gt-gen.yml b/.github/workflows/diffusion-ci-gt-gen.yml new file mode 100644 index 000000000000..ef039180b0c3 --- /dev/null +++ b/.github/workflows/diffusion-ci-gt-gen.yml @@ -0,0 +1,129 @@ +name: Diffusion CI Ground Truth Generation + +on: + workflow_dispatch: + inputs: + ref: + description: 'Git ref to checkout' + required: false + default: '' + type: string + case_ids: + description: 
'Specific case IDs to run (space-separated, optional)' + required: false + default: '' + type: string + +concurrency: + group: diffusion-ci-gt-gen-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: write + actions: read + +jobs: + multimodal-diffusion-gen-1gpu: + if: github.repository == 'sgl-project/sglang' + runs-on: 1-gpu-runner + strategy: + matrix: + part: [0, 1] + timeout-minutes: 60 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Install dependencies + run: bash scripts/ci/ci_install_dependency.sh diffusion + + - name: Generate outputs + run: | + cd python + python -m sglang.multimodal_gen.test.scripts.gen_diffusion_ci_outputs \ + --suite 1-gpu \ + --partition-id ${{ matrix.part }} \ + --total-partitions 2 \ + --out-dir ./diffusion-ci-outputs \ + --continue-on-error \ + ${{ inputs.case_ids != '' && format('--case-ids {0}', inputs.case_ids) || '' }} + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: diffusion-gen-1gpu-part${{ matrix.part }} + path: python/diffusion-ci-outputs + retention-days: 7 + + multimodal-diffusion-gen-2gpu: + if: github.repository == 'sgl-project/sglang' + runs-on: 2-gpu-runner + strategy: + matrix: + part: [0, 1] + timeout-minutes: 60 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Install dependencies + run: bash scripts/ci/ci_install_dependency.sh diffusion + + - name: Generate outputs + run: | + cd python + python -m sglang.multimodal_gen.test.scripts.gen_diffusion_ci_outputs \ + --suite 2-gpu \ + --partition-id ${{ matrix.part }} \ + --total-partitions 2 \ + --out-dir ./diffusion-ci-outputs \ + --continue-on-error \ + ${{ inputs.case_ids != '' && format('--case-ids {0}', inputs.case_ids) || '' }} + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: diffusion-gen-2gpu-part${{ matrix.part }} + path: 
python/diffusion-ci-outputs + retention-days: 7 + + diffusion-ci-push: + needs: [multimodal-diffusion-gen-1gpu, multimodal-diffusion-gen-2gpu] + if: github.repository == 'sgl-project/sglang' + runs-on: ubuntu-latest + steps: + - name: Checkout sgl-test-files + uses: actions/checkout@v4 + with: + repository: sgl-project/sgl-test-files + path: sgl-test-files + ref: main + token: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }} + + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + pattern: diffusion-gen-* + path: combined + merge-multiple: true + + - name: Copy image and video frame files + run: | + mkdir -p sgl-test-files/diffusion-ci/consistency_gt + find combined \( -name "*.png" -o -name "*.jpg" -o -name "*.jpeg" -o -name "*.webp" \) -type f -exec cp -f {} sgl-test-files/diffusion-ci/consistency_gt/ \; + + - name: Git commit and push + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }} + run: | + cd sgl-test-files + git config user.email "github-actions[bot]@users.noreply.github.com" + git config user.name "github-actions[bot]" + git remote set-url origin "https://x-access-token:${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}@github.com/sgl-project/sgl-test-files.git" + git add diffusion-ci/consistency_gt/ + git diff --staged --quiet || git commit -m "diffusion-ci: update consistency_gt images [automated]" + git push origin main diff --git a/.github/workflows/execute-notebook.yml b/.github/workflows/execute-notebook.yml index 953f34b72cbc..b24fe8c81b7f 100644 --- a/.github/workflows/execute-notebook.yml +++ b/.github/workflows/execute-notebook.yml @@ -3,9 +3,12 @@ name: Execute Notebooks on: pull_request: branches: [ main ] + types: [opened, synchronize, reopened, labeled] paths: - "python/sglang/**" - "docs/**" + - "!python/sglang/**/*.md" + - "!docs/**/*.md" workflow_dispatch: @@ -13,11 +16,20 @@ concurrency: group: execute-notebook-${{ github.ref }} cancel-in-progress: true +env: + SGLANG_IS_IN_CI: true jobs: + call-gate: + # Align with 
PR Test: fail fast if PR doesn't have run-ci label. + # This makes /tag-and-rerun-ci work by rerunning this failed workflow. + uses: ./.github/workflows/pr-gate.yml + secrets: inherit + run-all-notebooks: + needs: [call-gate] runs-on: 1-gpu-runner - if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci') + if: github.event_name != 'pull_request' || needs.call-gate.result == 'success' steps: - name: Checkout code uses: actions/checkout@v4 @@ -43,9 +55,11 @@ jobs: notebook-finish: needs: [ + call-gate, run-all-notebooks ] runs-on: ubuntu-latest + if: always() && needs.run-all-notebooks.result != 'skipped' steps: - name: Check all dependent job statuses run: | diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 018060fa42b2..80569a220169 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -26,25 +26,9 @@ jobs: run: SKIP=no-commit-to-branch pre-commit run --all-files --show-diff-on-failure - name: Run sgl-kernel clang-format checks - uses: DoozyX/clang-format-lint-action@v0.18.1 + uses: DoozyX/clang-format-lint-action@v0.20 with: source: sgl-kernel extensions: h,c,cpp,hpp,cu,cuh,cc - clangFormatVersion: 18 + clangFormatVersion: 20 style: file - - - name: Check proto files are in sync - run: | - if ! diff -q python/sglang/srt/grpc/sglang_scheduler.proto sgl-model-gateway/src/proto/sglang_scheduler.proto; then - echo "❌ ERROR: Proto files are out of sync!" - echo "" - echo "The following files must be kept identical:" - echo " - python/sglang/srt/grpc/sglang_scheduler.proto" - echo " - sgl-model-gateway/src/proto/sglang_scheduler.proto" - echo "" - echo "Please ensure both files have the same content." 
- echo "" - echo "Differences:" - diff python/sglang/srt/grpc/sglang_scheduler.proto sgl-model-gateway/src/proto/sglang_scheduler.proto || true - exit 1 - fi diff --git a/.github/workflows/list-active-pr-runs.yml.yml b/.github/workflows/list-active-pr-runs.yml.yml index e8f21297c489..10deab8374cf 100644 --- a/.github/workflows/list-active-pr-runs.yml.yml +++ b/.github/workflows/list-active-pr-runs.yml.yml @@ -1,4 +1,4 @@ -name: List Active PR Runs +name: List Active Runs on: workflow_dispatch: @@ -15,13 +15,13 @@ permissions: pull-requests: read jobs: - list-active-pr-runs: + list-active-runs: runs-on: ubuntu-latest steps: - name: Install GitHub CLI run: sudo apt-get install -y gh jq - - name: List active PR runs grouped by PR + - name: List active runs grouped by PR env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} REPO: ${{ github.repository }} @@ -31,7 +31,7 @@ jobs: set -euo pipefail echo "=========================================" - echo "🔍 Active PR Workflow Runs Report" + echo "🔍 Active Workflow Runs Report" echo "=========================================" echo "" @@ -54,7 +54,7 @@ jobs: --workflow "$workflow_file" \ --json databaseId,status,event,headBranch,createdAt,updatedAt,headSha,number,attempt \ --limit 500 \ - | jq -c '.[] | select(.status=="queued" or .status=="waiting" or .status=="in_progress") | select(.event=="pull_request")') + | jq -c '.[] | select(.status=="queued" or .status=="waiting" or .status=="in_progress")') if [ -z "$active_runs" ]; then continue @@ -64,6 +64,7 @@ jobs: echo "$active_runs" | while read -r run; do run_id=$(echo "$run" | jq -r '.databaseId') run_status=$(echo "$run" | jq -r '.status') + run_event=$(echo "$run" | jq -r '.event') created_at=$(echo "$run" | jq -r '.createdAt') head_sha=$(echo "$run" | jq -r '.headSha') run_number=$(echo "$run" | jq -r '.number') @@ -83,12 +84,12 @@ jobs: continue fi - # Find PR number + # Find PR number (may be empty for non-PR runs) pr_number=$(gh api 
"repos/$REPO/pulls?state=open&head=${head_owner}:${head_branch}" \ --jq '.[0].number // empty' 2>/dev/null || true) if [ -z "$pr_number" ]; then - continue + pr_number="NO_PR" fi # Get jobs for this run (with pagination to avoid missing jobs) @@ -106,25 +107,25 @@ jobs: queue_time=$((current_time - created_time)) queue_minutes=$((queue_time / 60)) - # Store data in temporary file - echo "$pr_number|$workflow_file|$run_id|$run_status|$running_jobs|$queued_jobs|$runners|$queue_minutes|$created_at|$head_sha|$run_attempt" >> "$pr_data_file" + # Store data in temporary file (unified format with event and branch) + echo "$pr_number|$workflow_file|$run_id|$run_status|$running_jobs|$queued_jobs|$runners|$queue_minutes|$created_at|$head_sha|$run_attempt|$run_event|$head_branch" >> "$pr_data_file" done done echo "" echo "=========================================" - echo "📊 Active PRs Summary" + echo "📊 Active Runs Summary" echo "=========================================" echo "" if [ ! -s "$pr_data_file" ]; then - echo "✅ No active PR runs found" + echo "✅ No active runs found" rm -f "$pr_data_file" exit 0 fi - # Get unique PR numbers - pr_numbers=$(cat "$pr_data_file" | cut -d'|' -f1 | sort -u) + # Get unique PR numbers (exclude NO_PR entries) + pr_numbers=$(cut -d'|' -f1 < "$pr_data_file" | grep -v '^NO_PR$' | sort -u || true) # Separate high priority and normal PRs high_priority_prs=() @@ -240,11 +241,74 @@ jobs: echo "" done + # --- Non-PR Runs Section --- + non_pr_runs=$(grep '^NO_PR|' "$pr_data_file" 2>/dev/null || true) + non_pr_running=0 + non_pr_queued=0 + + if [ -n "$non_pr_runs" ]; then + echo "=========================================" + echo "📦 Non-PR Runs (manual / scheduled / other)" + echo "=========================================" + echo "" + + echo "$non_pr_runs" | while read -r line; do + workflow=$(echo "$line" | cut -d'|' -f2) + run_id=$(echo "$line" | cut -d'|' -f3) + status=$(echo "$line" | cut -d'|' -f4) + running=$(echo "$line" | cut -d'|' -f5) + 
queued=$(echo "$line" | cut -d'|' -f6) + runners=$(echo "$line" | cut -d'|' -f7) + queue_min=$(echo "$line" | cut -d'|' -f8) + created=$(echo "$line" | cut -d'|' -f9) + attempt=$(echo "$line" | cut -d'|' -f11) + event=$(echo "$line" | cut -d'|' -f12) + branch=$(echo "$line" | cut -d'|' -f13) + + run_url="https://github.com/$REPO/actions/runs/$run_id" + + retry_count=$((attempt - 1)) + retry_indicator="" + if [ "$retry_count" -gt 0 ]; then + retry_indicator=" 🔄 Retry #$retry_count" + fi + + echo " 📦 Workflow: $workflow (Run #$run_id)$retry_indicator" + echo " Event: $event" + echo " Branch: $branch" + echo " Status: $status" + echo " 🟢 Running jobs: $running" + echo " 🟡 Queued jobs: $queued" + + if [ "$running" -gt 0 ] && [ "$runners" != "" ]; then + echo " 🖥️ Runners: $runners" + fi + + if [ "$queue_min" -gt 0 ]; then + echo " ⏱️ Queue time: ${queue_min} minutes" + fi + + echo " 🔗 Run URL: $run_url" + echo "" + done + + non_pr_running=$(echo "$non_pr_runs" | cut -d'|' -f5 | awk '{sum+=$1} END {print sum+0}') + non_pr_queued=$(echo "$non_pr_runs" | cut -d'|' -f6 | awk '{sum+=$1} END {print sum+0}') + non_pr_count=$(echo "$non_pr_runs" | wc -l | tr -d ' ') + + total_running=$((total_running + non_pr_running)) + total_queued=$((total_queued + non_pr_queued)) + + echo " 📊 Non-PR Total: $non_pr_running running, $non_pr_queued queued" + echo "" + fi + # Overall summary echo "=========================================" echo "📈 Overall Summary" echo "=========================================" echo "Total PRs with active runs: $pr_count" + echo "Total non-PR active runs: ${non_pr_count:-0}" echo "Total running jobs: $total_running" echo "Total queued jobs: $total_queued" echo "=========================================" diff --git a/.github/workflows/nightly-test-amd-rocm720.yml b/.github/workflows/nightly-test-amd-rocm720.yml new file mode 100644 index 000000000000..d94c0f86ccd7 --- /dev/null +++ b/.github/workflows/nightly-test-amd-rocm720.yml @@ -0,0 +1,1366 @@ +name: 
Nightly Test (AMD ROCm 7.2) + +on: + schedule: + - cron: '0 2 * * *' + push: + branches: + - main + paths: + - "python/sglang/version.py" + workflow_dispatch: + inputs: + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: true + job_select: + description: 'Select a job to run from dropdown (choose "all" to run all jobs)' + required: false + type: choice + default: 'all' + options: + - 'all' + - nightly-test-1-gpu-unit-rocm720 + - nightly-accuracy-2-gpu-rocm720 + - nightly-accuracy-2-gpu-vlm-rocm720 + - nightly-perf-2-gpu-text-rocm720 + - nightly-perf-2-gpu-vlm-rocm720 + - nightly-accuracy-8-gpu-rocm720 + - nightly-8-gpu-grok1-int4-rocm720 + - nightly-8-gpu-grok2-rocm720 + - nightly-8-gpu-deepseek-v31-rocm720 + - nightly-8-gpu-deepseek-v32-rocm720 + - nightly-8-gpu-deepseek-v32-mtp-rocm720 + - nightly-8-gpu-deepseek-v3-kv-fp8-rocm720 + - nightly-8-gpu-kimi-k25-rocm720 + - nightly-8-gpu-qwen3-235b-rocm720 + - nightly-8-gpu-qwen35-rocm720 + - nightly-8-gpu-glm5-rocm720 + - nightly-8-gpu-minimax-m25-rocm720 + - nightly-1-gpu-zimage-turbo-rocm720 + - nightly-test-1-gpu-mi35x-rocm720 + - nightly-accuracy-8-gpu-mi35x-rocm720 + - nightly-8-gpu-mi35x-grok1-int4-rocm720 + - nightly-8-gpu-mi35x-grok2-rocm720 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8-rocm720 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion-rocm720 + - nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720 + - nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720 + - nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720 + - nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720 + - nightly-8-gpu-mi35x-kimi-k25-rocm720 + - nightly-8-gpu-mi35x-qwen3-235b-mxfp4-rocm720 + - nightly-8-gpu-mi35x-qwen35-rocm720 + - 
nightly-8-gpu-mi35x-glm5-rocm720 + - nightly-8-gpu-mi35x-minimax-m25-rocm720 + job_filter: + description: 'Or type comma-separated job names (overrides dropdown if non-empty)' + required: false + type: string + default: '' + workflow_call: + inputs: + ref: + description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.' + required: false + type: string + default: '' + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' + job_filter: + description: 'Select which job to run (leave empty or "all" to run all jobs)' + required: false + type: string + default: 'all' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: true + +env: + AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }} + +concurrency: + # When called via workflow_call with ref set, use a unique group per caller run to avoid + # collisions with direct schedule/push triggers. We use inputs.ref (not github.event_name) + # to detect this, because github.event_name inherits from the caller in workflow_call. + group: nightly-test-amd-rocm720-${{ inputs.ref && format('caller-{0}', github.run_id) || github.ref }} + cancel-in-progress: ${{ !inputs.ref && github.event_name != 'workflow_call' }} + +jobs: + # ============================================== MI30x ROCm 7.2 Unit Tests ============================================== + # 1-GPU Unit Tests - LoRA, debug utils, scheduler, etc. 
(MI30x ROCm 7.2) + nightly-test-1-gpu-unit-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-test-1-gpu-unit-rocm720,')) + runs-on: linux-mi325-1gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Nightly Unit Test ROCm 7.2 (1-GPU) + timeout-minutes: 90 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-1-gpu --nightly --timeout-per-file 900 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # ============================================== MI30x ROCm 7.2 Accuracy Tests ============================================== + # 2-GPU Accuracy Tests - GSM8K eval (MI30x ROCm 7.2) + nightly-accuracy-2-gpu-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-accuracy-2-gpu-rocm720,')) + runs-on: linux-mi325-2gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Nightly Test ROCm 7.2 (2-GPU) + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 2-GPU VLM Accuracy Tests - Vision-Language Models MMMU evaluation (ROCm 7.2) + nightly-accuracy-2-gpu-vlm-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-accuracy-2-gpu-vlm-rocm720,')) + runs-on: linux-mi325-2gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Nightly Accuracy Test ROCm 7.2 (2-GPU VLM MMMU) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-2-gpu-vlm --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 2-GPU Text Models Performance Tests (ROCm 7.2) + nightly-perf-2-gpu-text-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-perf-2-gpu-text-rocm720,')) + runs-on: linux-mi325-2gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Performance Test ROCm 7.2 (2-GPU Text Models) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-perf-text-2-gpu --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 2-GPU VLM Performance Tests (ROCm 7.2) + nightly-perf-2-gpu-vlm-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-perf-2-gpu-vlm-rocm720,')) + runs-on: linux-mi325-2gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Performance Test ROCm 7.2 (2-GPU VLM Models) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-perf-vlm-2-gpu --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU Accuracy Tests - GPT-OSS, Grok1-FP8 (ROCm 7.2) + nightly-accuracy-8-gpu-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-accuracy-8-gpu-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + + - name: Accuracy Test ROCm 7.2 (8-GPU GPT-OSS) + timeout-minutes: 180 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-gpt-oss --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Accuracy Test ROCm 7.2 (8-GPU Grok1-FP8) + timeout-minutes: 60 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e RCCL_MSCCL_ENABLE=0 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-fp8 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # ============================================== MI30x ROCm 7.2 Combined Accuracy + Performance Tests ============================================== + # 8-GPU Grok1-INT4 (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-grok1-int4-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-grok1-int4-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + + - name: Accuracy Test ROCm 7.2 (8-GPU Grok1-INT4) + timeout-minutes: 60 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e RCCL_MSCCL_ENABLE=0 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test ROCm 7.2 (8-GPU Grok1-INT4) + timeout-minutes: 60 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e RCCL_MSCCL_ENABLE=0 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU Grok2 (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-grok2-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-grok2-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + + - name: Accuracy Test ROCm 7.2 (8-GPU Grok2) + timeout-minutes: 60 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e RCCL_MSCCL_ENABLE=0 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test ROCm 7.2 (8-GPU Grok2) + timeout-minutes: 60 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e RCCL_MSCCL_ENABLE=0 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU DeepSeek-V3.1 (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-deepseek-v31-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-deepseek-v31-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + + - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.1) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v31 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test ROCm 7.2 (8-GPU DeepSeek-V3.1) + timeout-minutes: 300 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_ROCM700A=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v31 --nightly --timeout-per-file 18000 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU DeepSeek-V3.2 (Basic Accuracy + Perf) ROCm 7.2 + nightly-8-gpu-deepseek-v32-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-deepseek-v32-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + + - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.2 Basic) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test ROCm 7.2 (8-GPU DeepSeek-V3.2 Basic) + timeout-minutes: 150 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-basic --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU DeepSeek-V3.2 MTP (MTP Accuracy + Perf) ROCm 7.2 + nightly-8-gpu-deepseek-v32-mtp-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-deepseek-v32-mtp-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + + - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.2 MTP) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test ROCm 7.2 (8-GPU DeepSeek-V3.2 MTP) + timeout-minutes: 180 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU DeepSeek-V3 KV FP8 (Basic + MTP with --kv-cache-dtype fp8_e4m3) ROCm 7.2 + nightly-8-gpu-deepseek-v3-kv-fp8-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-deepseek-v3-kv-fp8-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + + - name: DeepSeek-V3 KV FP8 Test ROCm 7.2 (8-GPU Basic + MTP) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-deepseek-v3-kv-fp8 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU Kimi-K2.5 (Accuracy) ROCm 7.2 + nightly-8-gpu-kimi-k25-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-kimi-k25-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + + - name: Accuracy Test ROCm 7.2 (8-GPU Kimi-K2.5) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-kimi-k25 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU Qwen3-235B (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-qwen3-235b-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-qwen3-235b-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + + - name: Accuracy Test + Performance Test ROCm 7.2 (8-GPU Qwen3) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-8-gpu-qwen3-235b --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2 + nightly-8-gpu-qwen35-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-qwen35-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_exec.sh pip install git+https://github.com/huggingface/transformers.git mistral-common "lm-eval[api]" --upgrade + + - name: Accuracy Test ROCm 7.2 (8-GPU Qwen 3.5) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-qwen35 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU GLM-5 (Accuracy) ROCm 7.2 + nightly-8-gpu-glm5-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-glm5-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # GLM-5 requires latest transformers for glm_moe_dsa architecture + bash scripts/ci/amd/amd_ci_exec.sh pip install git+https://github.com/huggingface/transformers.git + + - name: Accuracy Test ROCm 7.2 (8-GPU GLM-5 NSA) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-glm5 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU MiniMax-M2.5 (Accuracy) ROCm 7.2 + nightly-8-gpu-minimax-m25-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-minimax-m25-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + + - name: Accuracy Test ROCm 7.2 (8-GPU MiniMax-M2.5) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-minimax-m25 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # ============================================== MI30x ROCm 7.2 Diffusion Tests ============================================== + # 1-GPU Z-Image-Turbo (Diffusion T2I) ROCm 7.2 + nightly-1-gpu-zimage-turbo-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-1-gpu-zimage-turbo-rocm720,')) + runs-on: linux-mi325-1gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + + - name: Z-Image-Turbo Diffusion Test ROCm 7.2 (1-GPU) + timeout-minutes: 45 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + -e SGLANG_DIFFUSION_ARTIFACT_DIR="/sglang-checkout/diffusion-artifacts" \ + pytest test/registered/amd/test_zimage_turbo.py -v -s ${{ inputs.continue_on_error && '|| true' || '' }} || TEST_EXIT_CODE=$?
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Upload generated images + if: always() + uses: actions/upload-artifact@v4 + with: + name: zimage-turbo-outputs-rocm720 + path: diffusion-artifacts/ + if-no-files-found: ignore + retention-days: 30 + + # ============================================== MI35x ROCm 7.2 Tests ============================================== + # MI35x 1-GPU ROCm 7.2 tests + nightly-test-1-gpu-mi35x-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-test-1-gpu-mi35x-rocm720,')) + runs-on: linux-mi35x-gpu-1 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Nightly Test MI35x ROCm 7.2 (1-GPU) + timeout-minutes: 90 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-1-gpu-mi35x --nightly --timeout-per-file 900 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU Accuracy Tests - GPT-OSS (ROCm 7.2) + nightly-accuracy-8-gpu-mi35x-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-accuracy-8-gpu-mi35x-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU GPT-OSS) + timeout-minutes: 180 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU Grok1-INT4 (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-mi35x-grok1-int4-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-grok1-int4-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU Grok1-INT4) + timeout-minutes: 60 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e RCCL_MSCCL_ENABLE=0 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x ROCm 7.2 (8-GPU Grok1-INT4) + timeout-minutes: 60 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e RCCL_MSCCL_ENABLE=0 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU Grok2 (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-mi35x-grok2-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-grok2-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU Grok2) + timeout-minutes: 60 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e RCCL_MSCCL_ENABLE=0 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok2 --nightly 
--timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x ROCm 7.2 (8-GPU Grok2) + timeout-minutes: 60 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e RCCL_MSCCL_ENABLE=0 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU DeepSeek-R1-MXFP4 (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e 
GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4) + timeout-minutes: 300 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_perf_mi35x.py || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU DeepSeek-R1-MXFP4 KV FP8 (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4 KV FP8) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w 
/sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4 KV FP8) + timeout-minutes: 300 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion) + timeout-minutes: 180 + run: | + > 
github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion) + timeout-minutes: 300 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU DeepSeek-V3.2 Accuracy Test (ROCm 7.2) + nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2) + 
timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-v32 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU DeepSeek-V3.2 TP+MTP Accuracy Test (ROCm 7.2) + nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2 TP+MTP) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU DeepSeek-V3.2 Performance Test (Basic) ROCm 7.2 + nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2 Basic) + timeout-minutes: 150 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-basic --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU Kimi-K2.5 (Accuracy) ROCm 7.2 + nightly-8-gpu-mi35x-kimi-k25-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-kimi-k25-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU Kimi-K2.5) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-kimi-k25 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU Qwen3-235B-MXFP4 (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-mi35x-qwen3-235b-mxfp4-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-qwen3-235b-mxfp4-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test + Performance Test MI35x ROCm 7.2 (8-GPU Qwen3-235B-MXFP4) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-8-gpu-mi35x-qwen3-235b-mxfp4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2 + nightly-8-gpu-mi35x-qwen35-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-qwen35-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + bash scripts/ci/amd/amd_ci_exec.sh pip install git+https://github.com/huggingface/transformers.git mistral-common "lm-eval[api]" --upgrade + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU Qwen 3.5) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-qwen35 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + nightly-8-gpu-mi35x-glm5-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-glm5-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + # GLM-5 requires latest transformers for glm_moe_dsa architecture + bash scripts/ci/amd/amd_ci_exec.sh pip install git+https://github.com/huggingface/transformers.git + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU GLM-5 NSA) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-glm5 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU MiniMax-M2.5 (Accuracy) ROCm 7.2 + nightly-8-gpu-mi35x-minimax-m25-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-minimax-m25-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU MiniMax-M2.5) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-minimax-m25 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU DeepSeek-V3.2 Performance Test (MTP) ROCm 7.2 + nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2 MTP) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + check-all-jobs: + if: always() && (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch') + needs: + # MI30x ROCm 7.2 Unit Tests + - nightly-test-1-gpu-unit-rocm720 + # MI30x ROCm 7.2 Accuracy Tests + - nightly-accuracy-2-gpu-rocm720 + - nightly-accuracy-2-gpu-vlm-rocm720 + # MI30x ROCm 7.2 Performance Tests + - nightly-perf-2-gpu-text-rocm720 + - nightly-perf-2-gpu-vlm-rocm720 + - nightly-accuracy-8-gpu-rocm720 + # MI30x ROCm 7.2 Combined Accuracy + Performance Tests + - nightly-8-gpu-grok1-int4-rocm720 + - nightly-8-gpu-grok2-rocm720 + - nightly-8-gpu-deepseek-v31-rocm720 + - nightly-8-gpu-deepseek-v32-rocm720 + - nightly-8-gpu-deepseek-v32-mtp-rocm720 + - nightly-8-gpu-deepseek-v3-kv-fp8-rocm720 + - nightly-8-gpu-kimi-k25-rocm720 + - nightly-8-gpu-qwen3-235b-rocm720 + - nightly-8-gpu-qwen35-rocm720 + - nightly-8-gpu-glm5-rocm720 + - nightly-8-gpu-minimax-m25-rocm720 + # MI30x ROCm 7.2 Diffusion Tests + - nightly-1-gpu-zimage-turbo-rocm720 + # MI35x ROCm 7.2 jobs + - nightly-test-1-gpu-mi35x-rocm720 + - nightly-accuracy-8-gpu-mi35x-rocm720 + - nightly-8-gpu-mi35x-grok1-int4-rocm720 + - nightly-8-gpu-mi35x-grok2-rocm720 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8-rocm720 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion-rocm720 + - nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720 + - nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720 + - nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720 + - nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720 + - nightly-8-gpu-mi35x-kimi-k25-rocm720 + - nightly-8-gpu-mi35x-qwen3-235b-mxfp4-rocm720 + - nightly-8-gpu-mi35x-qwen35-rocm720 + - nightly-8-gpu-mi35x-glm5-rocm720 + - nightly-8-gpu-mi35x-minimax-m25-rocm720 + runs-on: ubuntu-latest + steps: + - name: Check if any job failed + run: | + if [[ "${{ contains(needs.*.result, 
'failure') }}" == "true" ]]; then + echo "One or more ROCm 7.2 nightly test jobs failed" + exit 1 + fi + if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then + echo "One or more ROCm 7.2 nightly test jobs were cancelled" + exit 1 + fi + echo "All ROCm 7.2 nightly test jobs passed" diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml index bfde6130fd72..5aa969b73c98 100644 --- a/.github/workflows/nightly-test-amd.yml +++ b/.github/workflows/nightly-test-amd.yml @@ -10,39 +10,62 @@ on: - "python/sglang/version.py" workflow_dispatch: inputs: - job_filter: - description: 'Select which job to run (leave empty or "all" to run all jobs)' + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: true + job_select: + description: 'Select a job to run from dropdown (choose "all" to run all jobs)' required: false type: choice default: 'all' options: - 'all' - # MI30x Unit Tests - - 'nightly-test-1-gpu-unit' - # MI30x Accuracy Tests (GSM8K / MMMU) - - 'nightly-accuracy-2-gpu' - - 'nightly-accuracy-2-gpu-vlm' - - 'nightly-perf-2-gpu-text' - - 'nightly-perf-2-gpu-vlm' - - 'nightly-accuracy-8-gpu' - # MI30x Accuracy + Performance Tests (combined) - - 'nightly-8-gpu-grok1-int4' - - 'nightly-8-gpu-grok2' - - 'nightly-8-gpu-deepseek-v31' - - 'nightly-8-gpu-deepseek-v32' - - 'nightly-8-gpu-deepseek-v32-mtp' - - 'nightly-8-gpu-kimi-k2' - # MI35x jobs - - 'nightly-test-1-gpu-mi35x' - - 'nightly-8-gpu-mi35x-kimi-k2' - - 'nightly-accuracy-8-gpu-mi35x' - - 'nightly-8-gpu-mi35x-grok1-int4' - - 'nightly-8-gpu-mi35x-grok2' - - 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4' - - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32' - - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp' - - 
'nightly-perf-8-gpu-mi35x-deepseek-v32-basic' - - 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp' + - nightly-test-1-gpu-unit + - nightly-accuracy-2-gpu + - nightly-accuracy-2-gpu-vlm + - nightly-perf-2-gpu-text + - nightly-perf-2-gpu-vlm + - nightly-accuracy-8-gpu + - nightly-8-gpu-grok1-int4 + - nightly-8-gpu-grok2 + - nightly-8-gpu-deepseek-v31 + - nightly-8-gpu-deepseek-v32 + - nightly-8-gpu-deepseek-v32-mtp + - nightly-8-gpu-deepseek-v3-kv-fp8 + - nightly-8-gpu-kimi-k25 + - nightly-8-gpu-qwen3-235b + - nightly-8-gpu-qwen35 + - nightly-8-gpu-glm5 + - nightly-8-gpu-minimax-m25 + - nightly-1-gpu-zimage-turbo + - nightly-test-1-gpu-mi35x + - nightly-accuracy-8-gpu-mi35x + - nightly-8-gpu-mi35x-grok1-int4 + - nightly-8-gpu-mi35x-grok2 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion + - nightly-accuracy-8-gpu-mi35x-deepseek-v32 + - nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp + - nightly-perf-8-gpu-mi35x-deepseek-v32-basic + - nightly-perf-8-gpu-mi35x-deepseek-v32-mtp + - nightly-8-gpu-mi35x-kimi-k25 + - nightly-8-gpu-mi35x-qwen3-235b-mxfp4 + - nightly-8-gpu-mi35x-qwen35 + - nightly-8-gpu-mi35x-glm5 + - nightly-8-gpu-mi35x-minimax-m25 + job_filter: + description: 'Or type comma-separated job names (overrides dropdown if non-empty)' + required: false + type: string + default: '' workflow_call: inputs: ref: @@ -50,22 +73,38 @@ on: required: false type: string default: '' + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' job_filter: description: 'Select which job to run (leave empty or "all" to run all jobs)' required: false type: string default: 'all' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: true + +env: + AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }} concurrency: - group: 
nightly-test-amd-${{ inputs.ref || github.ref }} - cancel-in-progress: ${{ github.event_name != 'workflow_call' }} + # When called via workflow_call with ref set, use a unique group per caller run to avoid + # collisions with direct schedule/push triggers. We use inputs.ref (not github.event_name) + # to detect this, because github.event_name inherits from the caller in workflow_call. + group: nightly-test-amd-${{ inputs.ref && format('caller-{0}', github.run_id) || github.ref }} + cancel-in-progress: ${{ !inputs.ref && github.event_name != 'workflow_call' }} jobs: # ============================================== MI30x Unit Tests ============================================== # 1-GPU Unit Tests - LoRA, debug utils, scheduler, etc. (MI30x only) nightly-test-1-gpu-unit: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-unit') - runs-on: linux-mi325-gpu-1 + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-test-1-gpu-unit,')) + runs-on: linux-mi325-1gpu-sglang steps: - name: Checkout code uses: actions/checkout@v4 @@ -83,19 +122,19 @@ jobs: run: bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Nightly Unit Test (1-GPU) - timeout-minutes: 60 + timeout-minutes: 90 run: | bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-1-gpu --nightly --timeout-per-file 600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-1-gpu --nightly --timeout-per-file 900 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # ============================================== MI30x Accuracy Tests ============================================== # 2-GPU Accuracy Tests - GSM8K eval (MI30x only) nightly-accuracy-2-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu') - runs-on: linux-mi325-gpu-2 + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-accuracy-2-gpu,')) + runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code uses: actions/checkout@v4 @@ -117,14 +156,14 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # 2-GPU VLM Accuracy Tests - Vision-Language Models MMMU evaluation nightly-accuracy-2-gpu-vlm: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-vlm') - runs-on: linux-mi325-gpu-2 + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-accuracy-2-gpu-vlm,')) + runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code uses: actions/checkout@v4 @@ -147,14 +186,14 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-2-gpu-vlm --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-2-gpu-vlm --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # 2-GPU Text Models Performance Tests nightly-perf-2-gpu-text: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-text') - runs-on: linux-mi325-gpu-2 + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-perf-2-gpu-text,')) + runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code uses: actions/checkout@v4 @@ -178,14 +217,14 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_AITER=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-perf-text-2-gpu --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-perf-text-2-gpu --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # 2-GPU VLM Performance Tests nightly-perf-2-gpu-vlm: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-vlm') - runs-on: linux-mi325-gpu-2 + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-perf-2-gpu-vlm,')) + runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code uses: actions/checkout@v4 @@ -209,14 +248,14 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_AITER=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-perf-vlm-2-gpu --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-perf-vlm-2-gpu --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # 8-GPU Accuracy Tests - GPT-OSS, Grok1-FP8 (accuracy only) nightly-accuracy-8-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu') - runs-on: linux-mi325-gpu-8 + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-accuracy-8-gpu,')) + runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code uses: actions/checkout@v4 @@ -238,7 +277,7 @@ jobs: run: | bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-gpt-oss --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-gpt-oss --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -248,15 +287,15 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-fp8 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-fp8 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # ============================================== MI30x Combined Accuracy + Performance Tests ============================================== # 8-GPU Grok1-INT4 (Accuracy + Performance combined) nightly-8-gpu-grok1-int4: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok1-int4') - runs-on: linux-mi325-gpu-8 + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-grok1-int4,')) + runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code uses: actions/checkout@v4 @@ -280,7 +319,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -292,14 +331,14 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # 8-GPU Grok2 (Accuracy + Performance combined) nightly-8-gpu-grok2: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok2') - runs-on: linux-mi325-gpu-8 + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-grok2,')) + runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code uses: actions/checkout@v4 @@ -323,7 +362,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -335,14 +374,14 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # 8-GPU DeepSeek-V3.1 (Accuracy + Performance combined) nightly-8-gpu-deepseek-v31: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v31') - runs-on: linux-mi325-gpu-8 + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-deepseek-v31,')) + runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code uses: actions/checkout@v4 @@ -366,7 +405,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_AITER=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v31 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v31 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -378,14 +417,14 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_ROCM700A=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v31 --nightly --timeout-per-file 18000 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v31 --nightly --timeout-per-file 18000 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # 8-GPU DeepSeek-V3.2 (Basic Accuracy + Perf) nightly-8-gpu-deepseek-v32: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32') - runs-on: linux-mi325-gpu-8 + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-deepseek-v32,')) + runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code uses: actions/checkout@v4 @@ -408,7 +447,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -419,14 +458,14 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-basic --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-basic --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # 8-GPU DeepSeek-V3.2 MTP (MTP Accuracy + Perf) nightly-8-gpu-deepseek-v32-mtp: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-mtp') - runs-on: linux-mi325-gpu-8 + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-deepseek-v32-mtp,')) + runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code uses: actions/checkout@v4 @@ -449,7 +488,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -460,14 +499,73 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU DeepSeek-V3 KV FP8 (Basic + MTP with --kv-cache-dtype fp8_e4m3) + nightly-8-gpu-deepseek-v3-kv-fp8: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-deepseek-v3-kv-fp8,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + + - name: DeepSeek-V3 KV FP8 Test (8-GPU Basic + MTP) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-deepseek-v3-kv-fp8 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU Kimi-K2.5 (Accuracy) + nightly-8-gpu-kimi-k25: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-kimi-k25,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + + - name: Accuracy Test (8-GPU Kimi-K2.5) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-kimi-k25 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU Kimi-K2 (Accuracy + Speed) - nightly-8-gpu-kimi-k2: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-kimi-k2') - runs-on: linux-mi325-gpu-8 + nightly-8-gpu-qwen3-235b: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-qwen3-235b,')) + runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code uses: actions/checkout@v4 @@ -484,20 +582,155 @@ jobs: - name: Install dependencies run: bash scripts/ci/amd/amd_ci_install_dependency.sh - - name: Accuracy Test (8-GPU Kimi-K2) + - name: Accuracy Test + Performance Test (8-GPU Qwen3) timeout-minutes: 120 run: | > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-kimi-k2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-8-gpu-qwen3-235b --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + # 8-GPU Qwen 3.5 (Accuracy) + nightly-8-gpu-qwen35: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-qwen35,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + bash scripts/ci/amd/amd_ci_exec.sh pip install git+https://github.com/huggingface/transformers.git mistral-common "lm-eval[api]" --upgrade + + - name: Accuracy Test (8-GPU Qwen 3.5) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-qwen35 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + nightly-8-gpu-glm5: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-glm5,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + # GLM-5 requires latest transformers for glm_moe_dsa architecture + bash scripts/ci/amd/amd_ci_exec.sh pip install git+https://github.com/huggingface/transformers.git + + - name: Accuracy Test (8-GPU GLM-5 NSA) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-glm5 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU MiniMax-M2.5 (Accuracy) + nightly-8-gpu-minimax-m25: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-minimax-m25,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + + - name: Accuracy Test (8-GPU MiniMax-M2.5) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-minimax-m25 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # ============================================== MI30x Diffusion Tests ============================================== + # 1-GPU Z-Image-Turbo (Diffusion T2I) + nightly-1-gpu-zimage-turbo: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-1-gpu-zimage-turbo,')) + runs-on: linux-mi325-1gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + + - name: Z-Image-Turbo Diffusion Test (1-GPU) + timeout-minutes: 45 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + -e SGLANG_DIFFUSION_ARTIFACT_DIR="/sglang-checkout/diffusion-artifacts" \ + pytest test/registered/amd/test_zimage_turbo.py -v -s ${{ inputs.continue_on_error && '|| true' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Upload generated images + if: always() + uses: actions/upload-artifact@v4 + with: + name: zimage-turbo-outputs + path: diffusion-artifacts/ + if-no-files-found: ignore + retention-days: 30 + # ============================================== MI35x Tests ============================================== # MI35x 1-GPU tests - platform-agnostic tests that may work on CDNA4 (gfx950) nightly-test-1-gpu-mi35x: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-mi35x') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-test-1-gpu-mi35x,')) runs-on: linux-mi35x-gpu-1 steps: - name: Checkout code @@ -519,17 +752,17 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate - name: Nightly Test MI35x (1-GPU) - timeout-minutes: 60 + timeout-minutes: 90 run: | bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-1-gpu-mi35x --nightly --timeout-per-file 600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-1-gpu-mi35x --nightly --timeout-per-file 900 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # MI35x 8-GPU Accuracy Tests - GPT-OSS (accuracy only) nightly-accuracy-8-gpu-mi35x: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-accuracy-8-gpu-mi35x,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -555,13 +788,13 @@ jobs: run: | bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # MI35x 8-GPU Grok1-INT4 (Accuracy + Performance combined) nightly-8-gpu-mi35x-grok1-int4: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok1-int4') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-grok1-int4,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -589,7 +822,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -601,13 +834,13 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # MI35x 8-GPU Grok2 (Accuracy + Performance combined) nightly-8-gpu-mi35x-grok2: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok2') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-grok2,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -635,7 +868,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -647,13 +880,13 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # MI35x 8-GPU DeepSeek-R1-MXFP4 (Accuracy + Performance combined) nightly-8-gpu-mi35x-deepseek-r1-mxfp4: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -680,7 +913,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -695,9 +928,97 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + # MI35x 8-GPU DeepSeek-R1-MXFP4 KV FP8 (Accuracy + Performance combined) + nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x (8-GPU DeepSeek-R1-MXFP4 KV FP8) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x (8-GPU DeepSeek-R1-MXFP4 KV FP8) + timeout-minutes: 300 + continue-on-error: true # Perf test failure doesn't fail the job if accuracy passed + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py || TEST_EXIT_CODE=$? + echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion (Accuracy + Performance combined) + nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x (8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && 
'--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x (8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion) + timeout-minutes: 300 + continue-on-error: true # Perf test failure doesn't fail the job if accuracy passed + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py || TEST_EXIT_CODE=$? + echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + # MI35x 8-GPU DeepSeek-V3.2 Accuracy Test nightly-accuracy-8-gpu-mi35x-deepseek-v32: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-accuracy-8-gpu-mi35x-deepseek-v32,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -724,13 +1045,13 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-v32 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-v32 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # MI35x 8-GPU DeepSeek-V3.2 TP+MTP Accuracy Test nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -757,13 +1078,13 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # MI35x 8-GPU DeepSeek-V3.2 Performance Test (Basic) nightly-perf-8-gpu-mi35x-deepseek-v32-basic: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-perf-8-gpu-mi35x-deepseek-v32-basic,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -790,13 +1111,13 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-basic --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-basic --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # MI35x 8-GPU Kimi-K2 (Accuracy) - nightly-8-gpu-mi35x-kimi-k2: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-kimi-k2') + # MI35x 8-GPU Kimi-K2.5 (Accuracy) + nightly-8-gpu-mi35x-kimi-k25: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-kimi-k25,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -817,19 +1138,152 @@ jobs: # Install tabulate for run_suite.py (missing in MI35x container) bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate - - name: Accuracy Test MI35x (8-GPU Kimi-K2) + - name: Accuracy Test MI35x (8-GPU Kimi-K2.5) timeout-minutes: 180 run: | > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-kimi-k2 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-kimi-k25 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU Qwen3-235B-MXFP4 (Accuracy + Performance) + nightly-8-gpu-mi35x-qwen3-235b-mxfp4: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-qwen3-235b-mxfp4,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test + Performance Test MI35x (8-GPU Qwen3-235B-MXFP4) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-8-gpu-mi35x-qwen3-235b-mxfp4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU Qwen 3.5 (Accuracy) + nightly-8-gpu-mi35x-qwen35: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-qwen35,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + bash scripts/ci/amd/amd_ci_exec.sh pip install git+https://github.com/huggingface/transformers.git mistral-common "lm-eval[api]" --upgrade + + - name: Accuracy Test MI35x (8-GPU Qwen 3.5) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-qwen35 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + nightly-8-gpu-mi35x-glm5: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-glm5,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + # GLM-5 requires latest transformers for glm_moe_dsa architecture + bash scripts/ci/amd/amd_ci_exec.sh pip install git+https://github.com/huggingface/transformers.git + + - name: Accuracy Test MI35x (8-GPU GLM-5 NSA) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-glm5 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU MiniMax-M2.5 (Accuracy) + nightly-8-gpu-mi35x-minimax-m25: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-minimax-m25,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x (8-GPU MiniMax-M2.5) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e SGLANG_USE_AITER=1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-minimax-m25 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} # MI35x 8-GPU DeepSeek-V3.2 Performance Test (MTP) nightly-perf-8-gpu-mi35x-deepseek-v32-mtp: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-perf-8-gpu-mi35x-deepseek-v32-mtp,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -856,7 +1310,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -878,16 +1332,29 @@ jobs: - nightly-8-gpu-deepseek-v31 - nightly-8-gpu-deepseek-v32 - nightly-8-gpu-deepseek-v32-mtp - - nightly-8-gpu-kimi-k2 + - nightly-8-gpu-deepseek-v3-kv-fp8 + - nightly-8-gpu-kimi-k25 + - nightly-8-gpu-qwen3-235b + - nightly-8-gpu-qwen35 + - nightly-8-gpu-glm5 + - nightly-8-gpu-minimax-m25 + # MI30x Diffusion Tests + - nightly-1-gpu-zimage-turbo # MI35x jobs - nightly-test-1-gpu-mi35x - nightly-accuracy-8-gpu-mi35x - nightly-8-gpu-mi35x-grok1-int4 - nightly-8-gpu-mi35x-grok2 - nightly-8-gpu-mi35x-deepseek-r1-mxfp4 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion - nightly-accuracy-8-gpu-mi35x-deepseek-v32 - nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp - - nightly-8-gpu-mi35x-kimi-k2 + - nightly-8-gpu-mi35x-kimi-k25 + - nightly-8-gpu-mi35x-qwen3-235b-mxfp4 + - nightly-8-gpu-mi35x-qwen35 + - nightly-8-gpu-mi35x-glm5 + - nightly-8-gpu-mi35x-minimax-m25 # MI35x perf jobs excluded from check - perf failures don't block CI # - nightly-perf-8-gpu-mi35x-deepseek-v32-basic # - nightly-perf-8-gpu-mi35x-deepseek-v32-mtp diff --git a/.github/workflows/nightly-test-npu.yml b/.github/workflows/nightly-test-npu.yml index 6705b0794a49..bfbbe80410af 100644 --- a/.github/workflows/nightly-test-npu.yml +++ b/.github/workflows/nightly-test-npu.yml @@ -55,11 +55,12 @@ jobs: # copy required file from our daily cache cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp # copy download through proxy - curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl + curl -o /tmp/test.jsonl -L https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl - name: Print Log Information run: | bash scripts/ci/npu/npu_log_print.sh + - name: Run test 
timeout-minutes: 240 env: @@ -102,7 +103,7 @@ jobs: # copy required file from our daily cache cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp # copy download through proxy - curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl + curl -o /tmp/test.jsonl -L https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl - name: Print Log Information run: | @@ -148,7 +149,7 @@ jobs: # copy required file from our daily cache cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp # copy download through proxy - curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl + curl -o /tmp/test.jsonl -L https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl - name: Print Log Information run: | diff --git a/.github/workflows/nightly-test-nvidia.yml b/.github/workflows/nightly-test-nvidia.yml index 5c78fde2e3a8..e95078cb7cc2 100644 --- a/.github/workflows/nightly-test-nvidia.yml +++ b/.github/workflows/nightly-test-nvidia.yml @@ -44,6 +44,7 @@ concurrency: env: SGLANG_IS_IN_CI: true + SGLANG_CUDA_COREDUMP: "1" HF_HUB_DOWNLOAD_TIMEOUT: 300 HF_HUB_ETAG_TIMEOUT: 300 @@ -68,6 +69,9 @@ jobs: cd test python3 run_suite.py --hw cuda --suite nightly-1-gpu --nightly --continue-on-error + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + # General tests - 4 GPU H100 nightly-test-general-4-gpu-h100: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-4-gpu-h100') @@ -88,6 +92,9 @@ jobs: cd test python3 run_suite.py --hw cuda --suite nightly-4-gpu --nightly 
--continue-on-error + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + # General tests - 8 GPU H200 nightly-test-general-8-gpu-h200: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-8-gpu-h200') @@ -120,6 +127,25 @@ jobs: cd test python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=18000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=4 + - name: Publish traces to storage repo + if: always() + continue-on-error: true + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + run: | + TRACE_ARGS="" + for dir in test/performance_profiles_*/; do + [ -d "$dir" ] && TRACE_ARGS="$TRACE_ARGS --traces-dir $dir" + done + if [ -n "$TRACE_ARGS" ]; then + python3 scripts/ci/utils/publish_traces.py $TRACE_ARGS + find test/performance_profiles_*/ -name '*.json.gz' -delete + else + echo "No trace directories found, skipping publish" + fi + - name: Run test timeout-minutes: 30 env: @@ -148,6 +174,11 @@ jobs: retention-days: 5 if-no-files-found: ignore + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + with: + artifact-suffix: ${{ matrix.partition }} + # General tests - 8 GPU H20 nightly-test-general-8-gpu-h20: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-8-gpu-h20') @@ -172,6 +203,9 @@ jobs: cd test python3 run_suite.py --hw cuda --suite nightly-8-gpu-h20 --nightly --continue-on-error + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + # General tests - 8 GPU B200 nightly-test-general-8-gpu-b200: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-8-gpu-b200') @@ -201,6 
+235,25 @@ jobs: cd test IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=12000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=4 + - name: Publish traces to storage repo + if: always() + continue-on-error: true + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + run: | + TRACE_ARGS="" + for dir in test/performance_profiles_*/; do + [ -d "$dir" ] && TRACE_ARGS="$TRACE_ARGS --traces-dir $dir" + done + if [ -n "$TRACE_ARGS" ]; then + python3 scripts/ci/utils/publish_traces.py $TRACE_ARGS + find test/performance_profiles_*/ -name '*.json.gz' -delete + else + echo "No trace directories found, skipping publish" + fi + - name: Collect performance metrics if: always() run: | @@ -221,6 +274,11 @@ jobs: retention-days: 5 if-no-files-found: ignore + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + with: + artifact-suffix: ${{ matrix.partition }} + # Text model accuracy tests nightly-test-text-accuracy-2-gpu-runner: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-text-accuracy-2-gpu-runner') @@ -241,6 +299,9 @@ jobs: cd test python3 run_suite.py --hw cuda --suite nightly-eval-text-2-gpu --nightly --continue-on-error --timeout-per-file 4500 + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + # Text model performance tests nightly-test-text-perf-2-gpu-runner: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-text-perf-2-gpu-runner') @@ -264,7 +325,7 @@ jobs: run: | cd test rm -rf performance_profiles_text_models/ - python3 run_suite.py --hw cuda --suite nightly-perf-text-2-gpu --nightly --continue-on-error + python3 run_suite.py --hw cuda --suite nightly-perf-text-2-gpu 
--nightly --continue-on-error --timeout-per-file 3600 - name: Publish traces to storage repo env: @@ -274,6 +335,9 @@ jobs: run: | python3 scripts/ci/utils/publish_traces.py --traces-dir test/performance_profiles_text_models + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + # VLM accuracy tests nightly-test-vlm-accuracy-2-gpu-runner: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-vlm-accuracy-2-gpu-runner') @@ -294,6 +358,9 @@ jobs: cd test python3 run_suite.py --hw cuda --suite nightly-eval-vlm-2-gpu --nightly --continue-on-error --timeout-per-file 9000 + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + # VLM performance tests nightly-test-vlm-perf-2-gpu-runner: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-vlm-perf-2-gpu-runner') @@ -317,7 +384,7 @@ jobs: run: | cd test rm -rf performance_profiles_vlms/ - python3 run_suite.py --hw cuda --suite nightly-perf-vlm-2-gpu --nightly --continue-on-error + python3 run_suite.py --hw cuda --suite nightly-perf-vlm-2-gpu --nightly --continue-on-error --timeout-per-file 3600 - name: Publish traces to storage repo env: @@ -327,6 +394,9 @@ jobs: run: | python3 scripts/ci/utils/publish_traces.py --traces-dir test/performance_profiles_vlms + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + # diffusion performance tests nightly-test-multimodal-server-1-gpu: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-multimodal-server-1-gpu') @@ -351,6 +421,7 @@ jobs: env: SGLANG_DIFFUSION_SLACK_TOKEN: ${{ secrets.SGLANG_DIFFUSION_SLACK_TOKEN }} GITHUB_RUN_ID: ${{ github.run_id }} + GPU_CONFIG: "1-gpu-runner" timeout-minutes: 60 run: | @@ -360,6 +431,28 @@ jobs: --partition-id ${{ matrix.part }} \ --total-partitions 
2 + - name: Collect diffusion performance metrics + if: always() + run: | + python3 scripts/ci/save_diffusion_metrics.py \ + --gpu-config 1-gpu-runner \ + --run-id ${{ github.run_id }} \ + --output python/diffusion-metrics-1gpu-partition-${{ matrix.part }}.json \ + --results-json python/diffusion-results.json + + - name: Upload diffusion metrics + if: always() + uses: actions/upload-artifact@v4 + with: + name: diffusion-metrics-1gpu-partition-${{ matrix.part }} + path: python/diffusion-metrics-1gpu-partition-${{ matrix.part }}.json + retention-days: 90 + if-no-files-found: ignore + + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + with: + artifact-suffix: ${{ matrix.part }} nightly-test-multimodal-server-2-gpu: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-multimodal-server-2-gpu') @@ -384,6 +477,7 @@ jobs: env: SGLANG_DIFFUSION_SLACK_TOKEN: ${{ secrets.SGLANG_DIFFUSION_SLACK_TOKEN }} GITHUB_RUN_ID: ${{ github.run_id }} + GPU_CONFIG: "2-gpu-runner" timeout-minutes: 60 run: | @@ -393,6 +487,29 @@ jobs: --partition-id ${{ matrix.part }} \ --total-partitions 2 + - name: Collect diffusion performance metrics + if: always() + run: | + python3 scripts/ci/save_diffusion_metrics.py \ + --gpu-config 2-gpu-runner \ + --run-id ${{ github.run_id }} \ + --output python/diffusion-metrics-2gpu-partition-${{ matrix.part }}.json \ + --results-json python/diffusion-results.json + + - name: Upload diffusion metrics + if: always() + uses: actions/upload-artifact@v4 + with: + name: diffusion-metrics-2gpu-partition-${{ matrix.part }} + path: python/diffusion-metrics-2gpu-partition-${{ matrix.part }}.json + retention-days: 90 + if-no-files-found: ignore + + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + with: + artifact-suffix: ${{ matrix.part }} + # B200 Performance tests - 4 GPU nightly-test-perf-4-gpu-b200: if: github.repository == 'sgl-project/sglang' && 
(inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-perf-4-gpu-b200') @@ -413,6 +530,9 @@ jobs: cd test python3 run_suite.py --hw cuda --suite nightly-4-gpu-b200 --nightly --continue-on-error --timeout-per-file 12000 + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + # Specialized B200 tests - 8 GPU, for specific backends and configs nightly-test-specialized-8-gpu-b200: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-perf-8-gpu-b200') @@ -437,12 +557,17 @@ jobs: cd test python3 run_suite.py --hw cuda --suite nightly-8-gpu-b200 --nightly --continue-on-error --timeout-per-file 2400 - # Consolidate performance metrics from all 8-GPU jobs + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + + # Consolidate performance metrics from all jobs consolidate-metrics: if: github.repository == 'sgl-project/sglang' && always() needs: - nightly-test-general-8-gpu-h200 - nightly-test-general-8-gpu-b200 + - nightly-test-multimodal-server-1-gpu + - nightly-test-multimodal-server-2-gpu runs-on: ubuntu-latest steps: - name: Checkout code @@ -453,7 +578,7 @@ jobs: - name: Download all partition metrics uses: actions/download-artifact@v4 with: - pattern: metrics-* + pattern: "*metrics-*" path: metrics/ merge-multiple: true diff --git a/.github/workflows/patch-docker-dev.yml b/.github/workflows/patch-docker-dev.yml new file mode 100644 index 000000000000..30b560783ff5 --- /dev/null +++ b/.github/workflows/patch-docker-dev.yml @@ -0,0 +1,115 @@ +name: Patch Docker Image + +on: + workflow_dispatch: + inputs: + pr_numbers: + description: "Comma-separated PR numbers to apply (e.g. 18962,19010)" + required: false + default: "" + image_tag: + description: "Base image tag to patch (e.g. 
dev-x86, dev-x86-cu13)" + required: true + +concurrency: + group: patch-docker-${{ inputs.image_tag }} + cancel-in-progress: true + +jobs: + patch: + if: github.repository == 'sgl-project/sglang' + runs-on: x64-docker-build-node + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Login to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Pull base image and extract commit + run: | + IMAGE="lmsysorg/sglang:${{ inputs.image_tag }}" + docker pull "${IMAGE}" + if BASE_SHA=$(docker run --rm "${IMAGE}" git -C /sgl-workspace/sglang rev-parse HEAD 2>/dev/null); then + echo "Image built from commit: ${BASE_SHA}" + else + BASE_SHA="" + echo "::warning::Image has no .git directory — cannot extract base commit" + fi + echo "BASE_SHA=${BASE_SHA}" >> "$GITHUB_ENV" + + - name: Generate patches + run: | + git config --global --add safe.directory "$GITHUB_WORKSPACE" + git fetch origin main + mkdir -p /tmp/patch-ctx + + if [ -n "${{ inputs.pr_numbers }}" ]; then + IFS=',' read -ra PRS <<< "${{ inputs.pr_numbers }}" + for pr in "${PRS[@]}"; do + pr=$(echo "${pr}" | xargs) + echo "Fetching PR #${pr}" + git fetch origin "pull/${pr}/head:pr-${pr}" + MERGE_BASE=$(git merge-base origin/main "pr-${pr}") + echo " PR #${pr}: merge-base=${MERGE_BASE}" + git diff "${MERGE_BASE}..pr-${pr}" > "/tmp/patch-ctx/${pr}.patch" + echo " PR #${pr}: $(wc -l < /tmp/patch-ctx/${pr}.patch) lines" + done + elif [ -n "${BASE_SHA}" ]; then + echo "Generating diff: image ${BASE_SHA} → latest main" + git fetch origin "${BASE_SHA}" + git diff "${BASE_SHA}..origin/main" > /tmp/patch-ctx/main.patch + echo " main: $(wc -l < /tmp/patch-ctx/main.patch) lines" + else + echo "::error::No PR numbers specified and image has no .git — cannot generate diff against main" + exit 1 + fi + + TOTAL=$(cat /tmp/patch-ctx/*.patch | wc -l) + if [ "${TOTAL}" -eq 0 ]; then + echo 
"::warning::All patches are empty — image is already up to date" + echo "SKIP_BUILD=true" >> "$GITHUB_ENV" + fi + + - name: Build patched image + if: env.SKIP_BUILD != 'true' + run: | + IMAGE="lmsysorg/sglang:${{ inputs.image_tag }}" + + cat <<'DOCKERFILE' > /tmp/patch-ctx/Dockerfile + ARG BASE_IMAGE + FROM ${BASE_IMAGE} + COPY *.patch /tmp/patches/ + RUN cd /sgl-workspace/sglang \ + && for p in /tmp/patches/*.patch; do \ + if [ ! -s "${p}" ]; then \ + echo "Skipping ${p} (empty)"; \ + else \ + echo "Applying ${p}..." \ + && patch -p1 --fuzz=2 --no-backup-if-mismatch -f < "${p}" \ + || { echo "ERROR: Failed to apply ${p}"; exit 1; }; \ + fi; \ + done \ + && rm -rf /tmp/patches + DOCKERFILE + + docker build \ + --no-cache \ + --build-arg BASE_IMAGE="${IMAGE}" \ + -t "${IMAGE}" \ + /tmp/patch-ctx/ + + - name: Push patched image + if: env.SKIP_BUILD != 'true' + run: | + IMAGE="lmsysorg/sglang:${{ inputs.image_tag }}" + docker push "${IMAGE}" + + echo "### Patched \`${IMAGE}\`" >> "$GITHUB_STEP_SUMMARY" + echo "- **Base commit:** \`${BASE_SHA:-unknown (no .git)}\`" >> "$GITHUB_STEP_SUMMARY" + echo "- **Source:** ${{ inputs.pr_numbers && format('PRs: {0}', inputs.pr_numbers) || 'latest main' }}" >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/pr-test-amd-rocm720.yml b/.github/workflows/pr-test-amd-rocm720.yml new file mode 100644 index 000000000000..4489ac74a82c --- /dev/null +++ b/.github/workflows/pr-test-amd-rocm720.yml @@ -0,0 +1,1026 @@ +name: PR Test ROCm 7.2 (AMD) +# Dynamic run-name for /rerun-stage commands to enable URL lookup +# Format: "[stage-name] sha" for fork PRs, "[stage-name]" for non-fork, default for normal runs +run-name: ${{ (inputs.target_stage || inputs.target_stage_select) && (inputs.pr_head_sha && format('[{0}] {1}', inputs.target_stage || inputs.target_stage_select, inputs.pr_head_sha) || format('[{0}]', inputs.target_stage || inputs.target_stage_select)) || '' }} + +on: + # run rocm 720 pr tests once a day at 2am UTC to avoid 
overwhelming the CI system + schedule: + - cron: '0 2 * * *' + # push: + # branches: [ main ] + # paths: + # - "python/**" + # - "scripts/ci/**" + # - "test/**" + # - "sgl-kernel/**" + # - ".github/workflows/pr-test-amd-rocm720.yml" + # - "docker/rocm.Dockerfile" + # pull_request: + # branches: [ main ] + # paths: + # - "python/**" + # - "scripts/ci/**" + # - "test/**" + # - "sgl-kernel/**" + # - ".github/workflows/pr-test-amd-rocm720.yml" + # - "docker/rocm.Dockerfile" + workflow_dispatch: + inputs: + target_stage_select: + description: "Select a stage to run from dropdown (leave empty for auto-detect)" + required: false + type: choice + default: '' + options: + - '' + - sgl-kernel-unit-test-amd + - sgl-kernel-unit-test-2-gpu-amd + - stage-a-test-1-amd + - jit-kernel-unit-test-amd + - stage-b-test-small-1-gpu-amd + - stage-b-test-small-1-gpu-amd-nondeterministic + - stage-b-test-small-1-gpu-amd-mi35x + - stage-b-test-large-1-gpu-amd + - stage-b-test-large-2-gpu-amd + - multimodal-gen-test-1-gpu-amd + - multimodal-gen-test-2-gpu-amd + - stage-c-test-large-8-gpu-amd + - stage-c-test-large-8-gpu-amd-mi35x + - stage-b-test-large-8-gpu-disaggregation-amd + target_stage: + description: "Or type comma-separated stage names (overrides dropdown if non-empty)" + required: false + type: string + default: "" + pr_head_sha: + description: "PR head SHA to checkout (for /rerun-stage on fork PRs)" + required: false + type: string + default: "" + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: true + workflow_call: + inputs: + ref: + description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.' 
+ required: false + type: string + default: '' + run_all_tests: + description: "Run all tests (for releasing or testing purpose)" + required: false + type: boolean + default: false + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: true + +env: + AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }} + +concurrency: + # When called via workflow_call with run_all_tests=true, use a unique group per run to + # avoid collisions with direct schedule/workflow_dispatch triggers. We use run_all_tests + # (not github.event_name) to detect this, because github.event_name inherits from the caller. + group: pr-test-amd-rocm720-${{ inputs.run_all_tests && format('full-{0}', github.run_id) || inputs.pr_head_sha || inputs.ref || github.ref }} + cancel-in-progress: ${{ !inputs.run_all_tests && github.event_name != 'workflow_call' }} + +jobs: + call-gate: + uses: ./.github/workflows/pr-gate.yml + secrets: inherit + check-changes: + needs: [call-gate] + runs-on: ubuntu-latest + outputs: + main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }} + sgl_kernel: ${{ steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }} + jit_kernel: ${{ steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }} + multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Determine run mode + id: run-mode + run: | + # Run all tests when run_all_tests is true (set by workflow_call callers) + # Note: github.event_name is inherited from the caller, so we detect workflow_call by checking inputs.run_all_tests + if [[ "${{ inputs.run_all_tests }}" ==
"true" ]]; then + echo "run_all_tests=true" >> $GITHUB_OUTPUT + echo "Run mode: ALL TESTS (run_all_tests=${{ inputs.run_all_tests }})" + else + echo "run_all_tests=false" >> $GITHUB_OUTPUT + echo "Run mode: FILTERED (triggered by ${{ github.event_name }})" + fi + + - name: Detect file changes + id: filter + uses: dorny/paths-filter@v3 + if: steps.run-mode.outputs.run_all_tests != 'true' + with: + filters: | + main_package: + - "python/sglang/!(multimodal_gen)/**" + - "python/pyproject_rocm.toml" + - "python/pyproject_other.toml" + - "scripts/ci/amd/*" + - "scripts/ci/utils/*" + - "test/**" + - ".github/workflows/pr-test-amd-rocm720.yml" + sgl_kernel: + - "sgl-kernel/**" + - ".github/workflows/pr-test-amd-rocm720.yml" + jit_kernel: + - "python/sglang/jit_kernel/**" + - ".github/workflows/pr-test-amd-rocm720.yml" + multimodal_gen: + - "python/sglang/multimodal_gen/**" + - "python/sglang/cli/**" + - "python/sglang/jit_kernel/diffusion/**" + - "python/pyproject_rocm.toml" + - "python/pyproject_other.toml" + + # =============================================== sgl-kernel ==================================================== + sgl-kernel-unit-test-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + needs.check-changes.outputs.sgl_kernel == 'true' + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-1gpu-sglang] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash 
scripts/ci/amd/amd_ci_install_dependency.sh + - name: Run test + timeout-minutes: 14 + run: | + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py + docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_apply_token_bitmask_inplace.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_activation.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_topk.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_kvcacheio.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_sigmoid.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_torch_defaults_reset.py + + sgl-kernel-unit-test-2-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-2-gpu-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + needs.check-changes.outputs.sgl_kernel == 'true' + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-2gpu-sglang] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Run test + timeout-minutes: 20 + run: | + docker exec -w 
/sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_amd_deterministic_custom_allreduce.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_amd_nccl_allreduce_determinism.py + + # =============================================== primary ==================================================== + + stage-a-test-1-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-a-test-1-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-1gpu-sglang] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Run test + timeout-minutes: 10 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-amd ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + + jit-kernel-unit-test-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',jit-kernel-unit-test-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + needs.check-changes.outputs.jit_kernel == 'true' + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-1gpu-sglang] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + 
uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Run JIT kernel unit tests + timeout-minutes: 10 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout" python3 -m pytest -q python/sglang/jit_kernel/tests/test_store_cache.py + + stage-b-test-small-1-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-small-1-gpu-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-1gpu-sglang] + part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + + 
stage-b-test-small-1-gpu-amd-nondeterministic: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-small-1-gpu-amd-nondeterministic,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-1gpu-sglang] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + + stage-b-test-small-1-gpu-amd-mi35x: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-small-1-gpu-amd-mi35x,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi35x-gpu-1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: 
bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + + stage-b-test-large-1-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-1-gpu-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-1gpu-sglang] + part: [0, 1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + + stage-b-test-large-2-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || 
inputs.target_stage_select), ',stage-b-test-large-2-gpu-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-2gpu-sglang] + part: [0, 1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + + multimodal-gen-test-1-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-1-gpu-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT + matrix: + runner: [linux-mi325-1gpu-sglang] + part: [0, 1] # 2 partitions: 11 tests ÷ 2 = ~5-6 tests each + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref 
|| github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Download artifacts + if: needs.check-changes.outputs.sgl_kernel == 'true' + uses: actions/download-artifact@v4 + with: + path: sgl-kernel/dist/ + merge-multiple: true + pattern: wheel-python3.10-cuda12.9 + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion + docker exec ci_sglang pip install amdsmi + + - name: Setup kernel caches + run: | + # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data) + # This directory persists across container restarts on the self-hosted runner + docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub + + # Clear pre-built AITER kernels from Docker image to avoid segfaults + # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/ + echo "Clearing pre-built AITER kernels from Docker image..." 
+ docker exec ci_sglang rm -rf /sgl-workspace/aiter/aiter/jit/*.so 2>/dev/null || true + docker exec ci_sglang rm -rf /sgl-data/aiter-kernels/*.so 2>/dev/null || true + echo "AITER kernels cleared - will be rebuilt on first use" + + # Create persistent cache marker if /sgl-data is a real mount (not ephemeral) + # This tells the test cleanup code to NOT delete downloaded models + if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then + docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache + echo "Created .persistent_cache marker - HF cache will persist" + else + echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test" + fi + + # Check MIOpen cache (VAE convolution kernels) + miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0") + echo "Found ${miopen_files} MIOpen cache files" + + - name: Diagnose HF cache and system resources + run: | + echo "=== System Memory Status ===" + free -h + echo "" + echo "=== Disk Space ===" + df -h /home/runner/sgl-data 2>/dev/null || df -h + echo "" + echo "=== HF Cache Directory Structure ===" + docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found" + docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found" + echo "" + echo "=== Checking for cached diffusion models (1-GPU tests) ===" + # Models used in 1-GPU tests: Wan2.1-T2V-1.3B, HunyuanVideo, Qwen-Image, FLUX.1, FLUX.2 + for model in "Wan-AI--Wan2.1-T2V-1.3B-Diffusers" "tencent--HunyuanVideo" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev" "black-forest-labs--FLUX.2-dev"; do + cache_path="/sgl-data/hf-cache/hub/models--${model}" + if docker exec ci_sglang test -d "$cache_path"; then + size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1) + echo "✓ CACHED: $model ($size)" + else + echo "✗ NOT CACHED: $model" + fi + done + echo 
"" + echo "=== GPU Memory Status ===" + docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available" + + - name: Run diffusion server tests (1-GPU) + timeout-minutes: 60 + run: | + # AMD CI: All 1-GPU tests except FLUX.2 (FLUX.1 covers same code path) + # Tests: T2V, T2I, I2V, LoRA + # + # HF download env vars: + # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available) + # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings + docker exec \ + -e SGLANG_E2E_TOLERANCE=0.3 \ + -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \ + -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \ + -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \ + -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \ + -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \ + -e AITER_JIT_DIR=/sgl-data/aiter-kernels \ + -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \ + -e HF_HUB_ENABLE_HF_TRANSFER=1 \ + -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \ + -w /sglang-checkout/python \ + ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \ + --suite 1-gpu \ + --partition-id ${{ matrix.part }} \ + --total-partitions 2 \ + -k "not flux_2" + + # Post-test diagnostics + echo "=== Post-test System Memory Status ===" + free -h + + multimodal-gen-test-2-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-2-gpu-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT + matrix: + runner: [linux-mi325-2gpu-sglang] + part: [0, 1] # 2 partitions: 9 tests ÷ 2 = ~4-5 tests each + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ 
inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Download artifacts + if: needs.check-changes.outputs.sgl_kernel == 'true' + uses: actions/download-artifact@v4 + with: + path: sgl-kernel/dist/ + merge-multiple: true + pattern: wheel-python3.10-cuda12.9 + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion + docker exec ci_sglang pip install amdsmi + + - name: Setup kernel caches + run: | + # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data) + docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub + + # Clear pre-built AITER kernels from Docker image to avoid segfaults + # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/ + echo "Clearing pre-built AITER kernels from Docker image..." 
+ docker exec ci_sglang rm -rf /sgl-workspace/aiter/aiter/jit/*.so 2>/dev/null || true + docker exec ci_sglang rm -rf /sgl-data/aiter-kernels/*.so 2>/dev/null || true + echo "AITER kernels cleared - will be rebuilt on first use" + + # Create persistent cache marker if /sgl-data is a real mount (not ephemeral) + # This tells the test cleanup code to NOT delete downloaded models + if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then + docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache + echo "Created .persistent_cache marker - HF cache will persist" + else + echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test" + fi + + # Check MIOpen cache (VAE convolution kernels) + miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0") + echo "Found ${miopen_files} MIOpen cache files" + + - name: Diagnose HF cache and system resources + run: | + echo "=== System Memory Status ===" + free -h + echo "" + echo "=== Disk Space ===" + df -h /home/runner/sgl-data 2>/dev/null || df -h + echo "" + echo "=== HF Cache Directory Structure ===" + docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found" + docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found" + echo "" + echo "=== Checking for cached diffusion models (2-GPU tests) ===" + # Models used in 2-GPU tests: Wan2.2-T2V-A14B, Wan2.1-T2V-14B, Qwen-Image, FLUX.1 + for model in "Wan-AI--Wan2.2-T2V-A14B-Diffusers" "Wan-AI--Wan2.1-T2V-14B-Diffusers" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev"; do + cache_path="/sgl-data/hf-cache/hub/models--${model}" + if docker exec ci_sglang test -d "$cache_path"; then + size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1) + echo "✓ CACHED: $model ($size)" + else + echo "✗ NOT CACHED: $model" + fi + done + echo "" + echo "=== GPU Memory 
Status ===" + docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available" + + - name: Run diffusion server tests (2-GPU) + timeout-minutes: 80 + run: | + # AMD CI: All 2-GPU tests including LoRA + # Tests: T2V, T2I, I2V, LoRA + # + # HF download env vars: + # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available) + # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings + docker exec \ + -e SGLANG_E2E_TOLERANCE=0.3 \ + -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \ + -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \ + -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \ + -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \ + -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \ + -e AITER_JIT_DIR=/sgl-data/aiter-kernels \ + -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \ + -e HF_HUB_ENABLE_HF_TRANSFER=1 \ + -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \ + -w /sglang-checkout/python \ + ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \ + --suite 2-gpu \ + --partition-id ${{ matrix.part }} \ + --total-partitions 2 + + # Post-test diagnostics + echo "=== Post-test System Memory Status ===" + free -h + + + stage-c-test-large-8-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + env: + RUNNER_LABELS: linux-mi325-8gpu-sglang + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-8gpu-sglang] + part: [0, 1, 2] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash 
scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Test RCCL multi-GPU communication + timeout-minutes: 5 + run: | + echo "Testing RCCL multi-GPU communication with debug info..." + docker exec ci_sglang bash -c "cd /sglang-checkout && NCCL_DEBUG=INFO RCCL_DEBUG=INFO torchrun --nproc_per_node=8 scripts/ci/amd/test_rccl_multi_gpu.py" + + - name: Run test + timeout-minutes: 60 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + + stage-c-test-large-8-gpu-amd-mi35x: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd-mi35x,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi35x-gpu-8] + part: [0] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + - name: Run test + timeout-minutes: 60 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite 
stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + + # =============================================== Disaggregation ==================================================== + stage-b-test-large-8-gpu-35x-disaggregation-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-8-gpu-disaggregation-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi35x-gpu-8.fabric] + + runs-on: ${{matrix.runner}} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Check Host RDMA Environment + id: rdma_detect + run: | + set +e + echo "=== Checking Host RDMA Environment ===" + + echo "" + echo "=== 1. Ionic driver library check ===" + ls -l /usr/lib/x86_64-linux-gnu/libibverbs/libionic* 2>/dev/null || echo "libionic not found in standard path" + + echo "" + echo "=== 2. Infiniband devices ===" + ls -la /dev/infiniband/ 2>/dev/null || echo "/dev/infiniband not found" + ls -la /sys/class/infiniband/ 2>/dev/null || echo "/sys/class/infiniband not found" + + echo "" + echo "=== 3. ibv_devinfo ===" + which ibv_devinfo 2>/dev/null && ibv_devinfo 2>&1 || echo "ibv_devinfo not available" + + echo "" + echo "=== 4. Kernel modules ===" + lsmod 2>/dev/null | grep -E "ib_|rdma|ionic" || echo "No RDMA kernel modules loaded" + + echo "" + echo "=== 5. 
Detect RDMA Devices for test environment ===" + if [ -d "/sys/class/infiniband" ]; then + RDMA_DEVS=$(ls /sys/class/infiniband | paste -sd "," -) + echo "Detected RDMA Devices: $RDMA_DEVS" + echo "SGLANG_TEST_RDMA_DEVICE=$RDMA_DEVS" >> $GITHUB_ENV + else + echo "No RDMA devices found in /sys/class/infiniband" + echo "SGLANG_TEST_RDMA_DEVICE=" >> $GITHUB_ENV + fi + + echo "" + echo "=== Host RDMA Check Complete ===" + + - name: Start Special Container + run: bash scripts/ci/amd/amd_ci_start_container_disagg.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + + - name: Verify RDMA in Container + run: | + docker exec -u root ci_sglang bash -c ' + echo "=== Container RDMA Verification ===" + echo "Device nodes:" + ls -la /dev/infiniband/ + echo "" + echo "Provider libraries:" + ls /usr/lib/x86_64-linux-gnu/libibverbs/ | grep -E "ionic|mlx" || echo "No Ionic/Mellanox providers" + echo "" + echo "HCA devices:" + HCA_COUNT=$(ibv_devinfo -list 2>&1 | grep -oE "^[0-9]+ HCAs? found" | grep -oE "^[0-9]+" || echo "0") + ibv_devinfo -list + if [ "$HCA_COUNT" -gt 0 ]; then + echo "" + echo "=== SUCCESS: RDMA setup complete. Found $HCA_COUNT HCA(s) ===" + else + echo "" + echo "=== WARNING: No HCAs detected. 
RDMA tests may fail ===" + fi + ' + + - name: Run Aiter Op Test (RMSNorm) + timeout-minutes: 10 + run: | + echo "Running pre-check: test_rmsnorm2d.py" + docker exec \ + -e MAX_JOBS=192 \ + ci_sglang \ + python /sgl-workspace/aiter/op_tests/test_rmsnorm2d.py + + - name: Run test_disaggregation + timeout-minutes: 60 + run: | + bash scripts/ci/amd/amd_ci_exec.sh \ + -e SGLANG_TEST_RDMA_DEVICE="${{ env.SGLANG_TEST_RDMA_DEVICE }}" \ + -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-35x-disaggregation-amd --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + + pr-test-amd-finish: + needs: + [ + call-gate, + check-changes, + + sgl-kernel-unit-test-amd, + sgl-kernel-unit-test-2-gpu-amd, + multimodal-gen-test-1-gpu-amd, + multimodal-gen-test-2-gpu-amd, + + stage-a-test-1-amd, + jit-kernel-unit-test-amd, + stage-b-test-small-1-gpu-amd, + stage-b-test-small-1-gpu-amd-nondeterministic, + stage-b-test-small-1-gpu-amd-mi35x, + stage-b-test-large-1-gpu-amd, + stage-b-test-large-2-gpu-amd, + stage-b-test-large-8-gpu-35x-disaggregation-amd, + stage-c-test-large-8-gpu-amd, + stage-c-test-large-8-gpu-amd-mi35x, + ] + if: always() + runs-on: ubuntu-latest + steps: + - name: Check all dependent job statuses + run: | + # Convert the 'needs' context to a JSON string + json_needs='${{ toJson(needs) }}' + + # Get a list of all job names from the JSON keys + job_names=$(echo "$json_needs" | jq -r 'keys_unsorted[]') + + for job in $job_names; do + # For each job, extract its result + result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result') + + # Print the job name and its result + echo "$job: $result" + + # Check for failure or cancellation and exit if found + if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then + echo "The above jobs failed." 
+ exit 1 + fi + done + + # If the loop completes, all jobs were successful + echo "All jobs completed successfully" + exit 0 diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index df1b0bed4744..1aa7b419b45b 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -1,7 +1,7 @@ name: PR Test (AMD) # Dynamic run-name for /rerun-stage commands to enable URL lookup # Format: "[stage-name] sha" for fork PRs, "[stage-name]" for non-fork, default for normal runs -run-name: ${{ inputs.target_stage && (inputs.pr_head_sha && format('[{0}] {1}', inputs.target_stage, inputs.pr_head_sha) || format('[{0}]', inputs.target_stage)) || '' }} +run-name: ${{ (inputs.target_stage || inputs.target_stage_select) && (inputs.pr_head_sha && format('[{0}] {1}', inputs.target_stage || inputs.target_stage_select, inputs.pr_head_sha) || format('[{0}]', inputs.target_stage || inputs.target_stage_select)) || '' }} on: push: @@ -24,8 +24,29 @@ on: - "docker/rocm.Dockerfile" workflow_dispatch: inputs: + target_stage_select: + description: "Select a stage to run from dropdown (leave empty for auto-detect)" + required: false + type: choice + default: '' + options: + - '' + - sgl-kernel-unit-test-amd + - sgl-kernel-unit-test-2-gpu-amd + - stage-a-test-1-amd + - jit-kernel-unit-test-amd + - stage-b-test-small-1-gpu-amd + - stage-b-test-small-1-gpu-amd-nondeterministic + - stage-b-test-small-1-gpu-amd-mi35x + - stage-b-test-large-1-gpu-amd + - stage-b-test-large-2-gpu-amd + - multimodal-gen-test-1-gpu-amd + - multimodal-gen-test-2-gpu-amd + - stage-c-test-large-8-gpu-amd + - stage-c-test-large-8-gpu-amd-mi35x + - stage-b-test-large-8-gpu-disaggregation-amd target_stage: - description: "Specific stage to run (optional, for quick testing)" + description: "Or type comma-separated stage names (overrides dropdown if non-empty)" required: false type: string default: "" @@ -34,6 +55,16 @@ on: required: false type: string default: "" + aiter_ref: 
+ description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: false workflow_call: inputs: ref: @@ -46,11 +77,26 @@ on: required: false type: boolean default: false + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: false + +env: + AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }} concurrency: - # Include pr_head_sha in group for /rerun-stage dispatches to avoid collisions with main branch runs - group: pr-test-amd-${{ inputs.pr_head_sha || inputs.ref || github.ref }} - cancel-in-progress: ${{ github.event_name != 'workflow_call' }} + # When called via workflow_call with run_all_tests=true, use a unique group per run to + # avoid collisions with direct push/PR triggers. We use run_all_tests (not github.event_name) + # to detect this, because github.event_name inherits from the caller in workflow_call. 
+ group: pr-test-amd-${{ inputs.run_all_tests && format('full-{0}', github.run_id) || inputs.pr_head_sha || inputs.ref || github.ref }} + cancel-in-progress: ${{ !inputs.run_all_tests && github.event_name != 'workflow_call' }} jobs: call-gate: @@ -62,6 +108,7 @@ jobs: outputs: main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }} sgl_kernel: ${{ steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }} + jit_kernel: ${{ steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }} multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }} steps: - name: Checkout code @@ -99,9 +146,13 @@ jobs: sgl_kernel: - "sgl-kernel/**" - ".github/workflows/pr-test-amd.yml" + jit_kernel: + - "python/sglang/jit_kernel/**" + - ".github/workflows/pr-test-amd.yml" multimodal_gen: - "python/sglang/multimodal_gen/**" - "python/sglang/cli/**" + - "python/sglang/jit_kernel/diffusion/**" - "python/pyproject_rocm.toml" - "python/pyproject_other.toml" @@ -111,16 +162,16 @@ jobs: if: | always() && ( - (inputs.target_stage == 'sgl-kernel-unit-test-amd') || + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-amd,')) || ( - !inputs.target_stage && + !(inputs.target_stage || inputs.target_stage_select) && needs.check-changes.outputs.sgl_kernel == 'true' ) ) strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-1] + runner: [linux-mi325-1gpu-sglang] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -158,16 +209,16 @@ jobs: if: | always() && ( - (inputs.target_stage == 'sgl-kernel-unit-test-2-gpu-amd') || + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-2-gpu-amd,')) || ( - !inputs.target_stage && + !(inputs.target_stage || inputs.target_stage_select) && needs.check-changes.outputs.sgl_kernel == 'true' ) ) strategy: fail-fast: false matrix: - runner: 
[linux-mi325-gpu-2] + runner: [linux-mi325-2gpu-sglang] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -200,9 +251,9 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-a-test-1-amd') || + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-a-test-1-amd,')) || ( - !inputs.target_stage && + !(inputs.target_stage || inputs.target_stage_select) && (!failure() && !cancelled()) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) @@ -210,7 +261,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-1] + runner: [linux-mi325-1gpu-sglang] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -233,16 +284,55 @@ jobs: - name: Run test timeout-minutes: 10 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-amd + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-amd ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + + jit-kernel-unit-test-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',jit-kernel-unit-test-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + needs.check-changes.outputs.jit_kernel == 'true' + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-1gpu-sglang] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + + - name: Run JIT kernel unit 
tests + timeout-minutes: 10 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout" python3 -m pytest -q python/sglang/jit_kernel/tests/test_store_cache.py stage-b-test-small-1-gpu-amd: needs: [check-changes, stage-a-test-1-amd] if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd') || + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-small-1-gpu-amd,')) || ( - !inputs.target_stage && + !(inputs.target_stage || inputs.target_stage_select) && (!failure() && !cancelled()) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) @@ -250,8 +340,8 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-1] - part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + runner: [linux-mi325-1gpu-sglang] + part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -273,16 +363,55 @@ jobs: - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 13 --timeout-per-file 1800 + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + + stage-b-test-small-1-gpu-amd-nondeterministic: + needs: [check-changes, stage-a-test-1-amd] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-small-1-gpu-amd-nondeterministic,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) 
+ ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-1gpu-sglang] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-b-test-small-1-gpu-amd-mi35x: needs: [check-changes, stage-a-test-1-amd] if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-mi35x') || + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-small-1-gpu-amd-mi35x,')) || ( - !inputs.target_stage && + !(inputs.target_stage || inputs.target_stage_select) && (!failure() && !cancelled()) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) @@ -312,16 +441,16 @@ jobs: - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-b-test-large-1-gpu-amd: needs: [check-changes, stage-a-test-1-amd] if: | always() && ( - (inputs.target_stage == 'stage-b-test-large-1-gpu-amd') || + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), 
',stage-b-test-large-1-gpu-amd,')) || ( - !inputs.target_stage && + !(inputs.target_stage || inputs.target_stage_select) && (!failure() && !cancelled()) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) @@ -329,7 +458,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-1] + runner: [linux-mi325-1gpu-sglang] part: [0, 1] runs-on: ${{matrix.runner}} steps: @@ -352,16 +481,16 @@ jobs: - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-b-test-large-2-gpu-amd: needs: [check-changes, stage-a-test-1-amd] if: | always() && ( - (inputs.target_stage == 'stage-b-test-large-2-gpu-amd') || + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-2-gpu-amd,')) || ( - !inputs.target_stage && + !(inputs.target_stage || inputs.target_stage_select) && (!failure() && !cancelled()) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) @@ -369,7 +498,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-2] + runner: [linux-mi325-2gpu-sglang] part: [0, 1] runs-on: ${{matrix.runner}} steps: @@ -392,16 +521,24 @@ jobs: - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 + bash 
scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} multimodal-gen-test-1-gpu-amd: needs: [check-changes] - if: needs.check-changes.outputs.multimodal_gen == 'true' + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-1-gpu-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + needs.check-changes.outputs.multimodal_gen == 'true' + ) + ) strategy: fail-fast: false max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT matrix: - runner: [linux-mi325-gpu-1] + runner: [linux-mi325-1gpu-sglang] part: [0, 1] # 2 partitions: 11 tests ÷ 2 = ~5-6 tests each runs-on: ${{matrix.runner}} steps: @@ -484,7 +621,7 @@ jobs: docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available" - name: Run diffusion server tests (1-GPU) - timeout-minutes: 45 + timeout-minutes: 70 run: | # AMD CI: All 1-GPU tests except FLUX.2 (FLUX.1 covers same code path) # Tests: T2V, T2I, I2V, LoRA @@ -516,12 +653,20 @@ jobs: multimodal-gen-test-2-gpu-amd: needs: [check-changes] - if: needs.check-changes.outputs.multimodal_gen == 'true' + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-2-gpu-amd,')) || + ( + !(inputs.target_stage || inputs.target_stage_select) && + needs.check-changes.outputs.multimodal_gen == 'true' + ) + ) strategy: fail-fast: false max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT matrix: - runner: [linux-mi325-gpu-2] + runner: [linux-mi325-2gpu-sglang] part: [0, 1] # 2 partitions: 9 tests ÷ 2 = ~4-5 tests each runs-on: ${{matrix.runner}} steps: @@ -638,19 +783,19 
@@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-c-test-large-8-gpu-amd') || + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd,')) || ( - !inputs.target_stage && + !(inputs.target_stage || inputs.target_stage_select) && (!failure() && !cancelled()) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) env: - RUNNER_LABELS: linux-mi325-gpu-8 + RUNNER_LABELS: linux-mi325-8gpu-sglang strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-8] + runner: [linux-mi325-8gpu-sglang] part: [0, 1, 2] runs-on: ${{matrix.runner}} steps: @@ -679,16 +824,16 @@ jobs: - name: Run test timeout-minutes: 60 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-c-test-large-8-gpu-amd-mi35x: needs: [check-changes, call-gate, stage-b-test-small-1-gpu-amd, stage-b-test-large-2-gpu-amd] if: | always() && ( - (inputs.target_stage == 'stage-c-test-large-8-gpu-amd-mi35x') || + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd-mi35x,')) || ( - !inputs.target_stage && + !(inputs.target_stage || inputs.target_stage_select) && (!failure() && !cancelled()) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) @@ -719,7 +864,7 @@ jobs: - name: Run test timeout-minutes: 60 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite 
stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} # =============================================== Disaggregation ==================================================== stage-b-test-large-8-gpu-35x-disaggregation-amd: @@ -727,9 +872,9 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-large-8-gpu-disaggregation-amd') || + (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-8-gpu-disaggregation-amd,')) || ( - !inputs.target_stage && + !(inputs.target_stage || inputs.target_stage_select) && (!failure() && !cancelled()) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) @@ -831,7 +976,7 @@ jobs: run: | bash scripts/ci/amd/amd_ci_exec.sh \ -e SGLANG_TEST_RDMA_DEVICE="${{ env.SGLANG_TEST_RDMA_DEVICE }}" \ - -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-35x-disaggregation-amd --timeout-per-file 1800 + -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-35x-disaggregation-amd --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} pr-test-amd-finish: needs: @@ -845,7 +990,9 @@ jobs: multimodal-gen-test-2-gpu-amd, stage-a-test-1-amd, + jit-kernel-unit-test-amd, stage-b-test-small-1-gpu-amd, + stage-b-test-small-1-gpu-amd-nondeterministic, stage-b-test-small-1-gpu-amd-mi35x, stage-b-test-large-1-gpu-amd, stage-b-test-large-2-gpu-amd, diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index 0361d078fc4f..31844f18fb83 100644 --- 
a/.github/workflows/pr-test-npu.yml +++ b/.github/workflows/pr-test-npu.yml @@ -28,8 +28,9 @@ jobs: check-changes: runs-on: ubuntu-latest outputs: - main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }} - multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }} + changes_exist: ${{ steps.filter.outputs.main_package == 'true' || steps.filter.outputs.multimodal_gen == 'true' || steps.run-mode.outputs.run_all_tests == 'true'}} + main_package: ${{ steps.filter.outputs.main_package == 'true' || steps.run-mode.outputs.run_all_tests == 'true' }} + multimodal_gen: ${{ steps.filter.outputs.multimodal_gen == 'true' || steps.run-mode.outputs.run_all_tests == 'true' }} steps: - name: Checkout code uses: actions/checkout@v4 @@ -63,14 +64,15 @@ jobs: - ".github/workflows/pr-test-npu.yml" multimodal_gen: - "python/sglang/multimodal_gen/**" + - "python/sglang/srt/**" - "python/pyproject_npu.toml" - - "scripts/ci/npu_ci_install_dependency.sh" + - "scripts/ci/npu/npu_ci_install_dependency.sh" - ".github/workflows/pr-test-npu.yml" # ==================== PR Gate ==================== # pr-gate: needs: check-changes - if: needs.check-changes.outputs.main_package == 'true' + if: needs.check-changes.outputs.changes_exist == 'true' uses: ./.github/workflows/pr-gate.yml secrets: inherit @@ -78,6 +80,10 @@ jobs: needs: [check-changes, pr-gate] if: needs.check-changes.outputs.main_package == 'true' runs-on: linux-aarch64-a2-1 + strategy: + fail-fast: false + matrix: + part: [ 0, 1 ] container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-910b-ubuntu22.04-py3.11 steps: @@ -87,13 +93,16 @@ jobs: ref: ${{ inputs.ref || github.ref }} - name: Install dependencies + env: + TORCH_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/whl/cpu" + PYPI_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple" + GITHUB_PROXY_URL: 
"https://gh-proxy.test.osinfra.cn/" run: | # speed up by using infra cache services CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local" sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list pip config set global.index-url http://${CACHING_URL}/pypi/simple - pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple" - pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn" + pip config set global.trusted-host "${CACHING_URL}" bash scripts/ci/npu/npu_ci_install_dependency.sh 910b # copy required file from our daily cache @@ -112,7 +121,7 @@ jobs: STREAMS_PER_DEVICE: 32 run: | cd test/srt - python3 run_suite.py --suite per-commit-1-npu-a2 + python3 run_suite.py --suite per-commit-1-npu-a2 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 per-commit-2-npu-a2: needs: [check-changes, pr-gate] @@ -121,7 +130,7 @@ jobs: strategy: fail-fast: true matrix: - part: [0, 1, 2] + part: [0, 1] container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-910b-ubuntu22.04-py3.11 steps: @@ -131,13 +140,16 @@ jobs: ref: ${{ inputs.ref || github.ref }} - name: Install dependencies + env: + TORCH_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/whl/cpu" + PYPI_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple" + GITHUB_PROXY_URL: "https://gh-proxy.test.osinfra.cn/" run: | # speed up by using infra cache services CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local" sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list pip config set global.index-url http://${CACHING_URL}/pypi/simple - pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple" - pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn" + pip config set global.trusted-host "${CACHING_URL}" bash scripts/ci/npu/npu_ci_install_dependency.sh 910b # copy required file from our 
daily cache @@ -156,14 +168,14 @@ jobs: STREAMS_PER_DEVICE: 32 run: | cd test/srt - python3 run_suite.py --suite per-commit-2-npu-a2 --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 + python3 run_suite.py --suite per-commit-2-npu-a2 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 - per-commit-4-npu-a2: + per-commit-4-npu-a3: needs: [check-changes, pr-gate] if: needs.check-changes.outputs.main_package == 'true' - runs-on: linux-aarch64-a2-4 + runs-on: linux-aarch64-a3-4 container: - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-910b-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-a3-ubuntu22.04-py3.11 steps: - name: Checkout code uses: actions/checkout@v4 @@ -171,15 +183,18 @@ jobs: ref: ${{ inputs.ref || github.ref }} - name: Install dependencies + env: + TORCH_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/whl/cpu" + PYPI_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple" + GITHUB_PROXY_URL: "https://gh-proxy.test.osinfra.cn/" run: | # speed up by using infra cache services CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local" sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list pip config set global.index-url http://${CACHING_URL}/pypi/simple - pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple" - pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn" + pip config set global.trusted-host "${CACHING_URL}" - bash scripts/ci/npu/npu_ci_install_dependency.sh 910b + bash scripts/ci/npu/npu_ci_install_dependency.sh a3 # copy required file from our daily cache cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp # copy download through proxy @@ -196,16 +211,12 @@ jobs: STREAMS_PER_DEVICE: 32 run: | cd test/srt - python3 run_suite.py --suite per-commit-4-npu-a2 
--timeout-per-file 3600 + python3 run_suite.py --suite per-commit-4-npu-a3 --timeout-per-file 3600 per-commit-16-npu-a3: needs: [check-changes, pr-gate] if: needs.check-changes.outputs.main_package == 'true' runs-on: linux-aarch64-a3-16 - strategy: - fail-fast: true - matrix: - part: [0, 1] container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-a3-ubuntu22.04-py3.11 steps: @@ -214,6 +225,131 @@ jobs: with: ref: ${{ inputs.ref || github.ref }} + - name: Install dependencies + env: + TORCH_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/whl/cpu" + PYPI_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple" + GITHUB_PROXY_URL: "https://gh-proxy.test.osinfra.cn/" + run: | + # speed up by using infra cache services + CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local" + sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list + pip config set global.index-url http://${CACHING_URL}/pypi/simple + pip config set global.trusted-host "${CACHING_URL}" + + bash scripts/ci/npu/npu_ci_install_dependency.sh a3 + # copy required file from our daily cache + cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp + # copy download through proxy + curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl + + - name: Run test + timeout-minutes: 60 + env: + SGLANG_USE_MODELSCOPE: true + SGLANG_IS_IN_CI: true + HF_ENDPOINT: https://hf-mirror.com + TORCH_EXTENSIONS_DIR: /tmp/torch_extensions + PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" + STREAMS_PER_DEVICE: 32 + run: | + cd test/srt + python3 run_suite.py --suite per-commit-16-npu-a3 --timeout-per-file 3600 + + multimodal-gen-test-1-npu-a3: + needs: [check-changes, pr-gate] + if: needs.check-changes.outputs.multimodal_gen == 'true' + runs-on: 
linux-aarch64-a3-2 + container: + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-a3-ubuntu22.04-py3.11 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + env: + TORCH_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/whl/cpu" + PYPI_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple" + GITHUB_PROXY_URL: "https://gh-proxy.test.osinfra.cn/" + run: | + # speed up by using infra cache services + CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local" + sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list + pip config set global.index-url http://${CACHING_URL}/pypi/simple + pip config set global.trusted-host "${CACHING_URL}" + + bash scripts/ci/npu/npu_ci_install_dependency.sh a3 + # copy required file from our daily cache + cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp + # copy download through proxy + curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl + + - name: Run test + timeout-minutes: 60 + env: + SGLANG_USE_MODELSCOPE: true + SGLANG_IS_IN_CI: true + HF_ENDPOINT: https://hf-mirror.com + TORCH_EXTENSIONS_DIR: /tmp/torch_extensions + PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" + STREAMS_PER_DEVICE: 32 + run: | + export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}" + cd python + python3 sglang/multimodal_gen/test/run_suite.py --suite 1-npu + + multimodal-gen-test-2-npu-a3: + needs: [check-changes, pr-gate] + if: needs.check-changes.outputs.multimodal_gen == 'true' + runs-on: linux-aarch64-a3-16 + container: + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-a3-ubuntu22.04-py3.11 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + env: + 
TORCH_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/whl/cpu" + PYPI_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple" + GITHUB_PROXY_URL: "https://gh-proxy.test.osinfra.cn/" + run: | + # speed up by using infra cache services + CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local" + sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list + pip config set global.index-url http://${CACHING_URL}/pypi/simple + pip config set global.trusted-host "${CACHING_URL}" + + bash scripts/ci/npu/npu_ci_install_dependency.sh a3 + # copy required file from our daily cache + cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp + # copy download through proxy + curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl + + - name: Run test + timeout-minutes: 60 + env: + SGLANG_USE_MODELSCOPE: true + SGLANG_IS_IN_CI: true + HF_ENDPOINT: https://hf-mirror.com + TORCH_EXTENSIONS_DIR: /tmp/torch_extensions + PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" + STREAMS_PER_DEVICE: 32 + run: | + export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}" + cd python + python3 sglang/multimodal_gen/test/run_suite.py --suite 2-npu + + multimodal-gen-test-8-npu-a3: + needs: [check-changes, pr-gate] + if: needs.check-changes.outputs.multimodal_gen == 'true' + runs-on: linux-aarch64-a3-16 + container: + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-a3-ubuntu22.04-py3.11 + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Install dependencies run: | # speed up by using infra cache services @@ -239,5 +375,5 @@ jobs: PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" STREAMS_PER_DEVICE: 32 run: | - cd test/srt - python3 run_suite.py --suite per-commit-16-npu-a3 --timeout-per-file 3600 
--auto-partition-id ${{ matrix.part }} --auto-partition-size 2 + cd python + python3 sglang/multimodal_gen/test/run_suite.py --suite 8-npu diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 2b0ddb8b9bf9..6f0f5f73f0dc 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -52,20 +52,24 @@ on: default: false concurrency: - # Concurrency group structure: pr-test-{branch}-{pr_sha}-{stage} + # Concurrency group structure: pr-test-{event}-{branch}-{pr_sha}-{stage} + # - event_name prevents scheduled runs from colliding with fork PRs whose branch is named 'main' + # (without it, both resolve the branch segment to 'main' and block each other) # - github.head_ref (pull_request) or github.ref_name (workflow_dispatch) normalizes to branch name # - pr_head_sha isolates /rerun-stage from main branch runs # - target_stage allows parallel stage dispatches to run independently - # This ensures pull_request and workflow_dispatch on same branch cancel each other - group: pr-test-${{ github.head_ref || github.ref_name || 'default' }}-${{ inputs.pr_head_sha || 'current' }}-${{ inputs.target_stage || inputs.ref || 'all' }} + group: pr-test-${{ github.event_name }}-${{ github.head_ref || github.ref_name || 'default' }}-${{ inputs.pr_head_sha || 'current' }}-${{ inputs.target_stage || inputs.ref || 'all' }} cancel-in-progress: ${{ github.event_name != 'workflow_call' }} env: SGLANG_IS_IN_CI: true + SGLANG_CUDA_COREDUMP: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: true permissions: actions: write contents: read + pull-requests: read jobs: # =============================================== check changes ==================================================== @@ -92,6 +96,10 @@ jobs: with: ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + - name: Show test partition assignments + continue-on-error: true + run: python3 test/show_partitions.py + - name: Determine run mode id: run-mode run: | @@ -128,6 +136,7 @@ jobs: - 
".github/workflows/pr-test.yml" multimodal_gen: - "python/sglang/multimodal_gen/**" + - "python/sglang/jit_kernel/**" - "python/sglang/cli/**" - "python/pyproject.toml" - ".github/workflows/pr-test.yml" @@ -198,6 +207,8 @@ jobs: - name: Set max-parallel based on run type id: set-parallel + env: + GH_TOKEN: ${{ github.token }} run: | # Scheduled runs and high-priority PRs get full parallelism if [[ "${{ github.event_name }}" == "schedule" ]]; then @@ -206,6 +217,27 @@ jobs: elif [[ "${{ github.event_name }}" == "pull_request" && "${{ contains(github.event.pull_request.labels.*.name, 'high priority') }}" == "true" ]]; then echo "max_parallel=14" >> $GITHUB_OUTPUT echo "High priority PR detected, setting max_parallel to 14" + elif [[ -n "${{ inputs.target_stage }}" ]]; then + # /rerun-stage (workflow_dispatch): query PR labels via GitHub API + # Try SHA lookup first (fork PRs), fallback to branch name (non-fork PRs) + LABELS="" + PR_HEAD_SHA="${{ inputs.pr_head_sha }}" + if [[ -n "$PR_HEAD_SHA" ]]; then + LABELS=$(gh api "repos/${{ github.repository }}/commits/${PR_HEAD_SHA}/pulls" \ + --jq '.[0].labels[].name' 2>/dev/null || true) + fi + if [[ -z "$LABELS" ]]; then + LABELS=$(gh pr list --head "${{ github.ref_name }}" --repo "${{ github.repository }}" \ + --json labels --jq '.[0].labels[].name' 2>/dev/null || true) + fi + echo "PR labels: ${LABELS:-"(none)"}" + if echo "$LABELS" | grep -Fxq "high priority"; then + echo "max_parallel=14" >> $GITHUB_OUTPUT + echo "High priority PR detected via API (/rerun-stage), setting max_parallel to 14" + else + echo "max_parallel=3" >> $GITHUB_OUTPUT + echo "Using default max_parallel of 3 (/rerun-stage, no high priority label)" + fi else echo "max_parallel=3" >> $GITHUB_OUTPUT echo "Using default max_parallel of 3" @@ -848,6 +880,9 @@ jobs: # temporarily put backend-independent cpu tests here python3 run_suite.py --hw cpu --suite default $CONTINUE_ON_ERROR_FLAG + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + 
stage-a-cpu-only: needs: [check-changes, call-gate] if: | @@ -937,7 +972,7 @@ jobs: CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh git clone https://github.com/merrymercy/human-eval.git cd human-eval - pip install -e . + pip install -e . --no-build-isolation - name: Run test timeout-minutes: 30 @@ -950,6 +985,11 @@ jobs: fi python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 8 $CONTINUE_ON_ERROR_FLAG + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + with: + artifact-suffix: ${{ matrix.partition }} + # Runs on H100 (80GB, SM90) - tests that don't pass on 5090 (FA3, FP8, high VRAM, etc.) stage-b-test-large-1-gpu: needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels] @@ -1001,6 +1041,11 @@ jobs: fi python3 run_suite.py --hw cuda --suite stage-b-test-large-1-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 14 --timeout-per-file 1800 $CONTINUE_ON_ERROR_FLAG + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + with: + artifact-suffix: ${{ matrix.partition }} + stage-b-test-large-2-gpu: needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels] if: | @@ -1041,7 +1086,7 @@ jobs: CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh git clone https://github.com/merrymercy/human-eval.git cd human-eval - pip install -e . + pip install -e . 
--no-build-isolation - name: Run test timeout-minutes: 30 @@ -1053,6 +1098,11 @@ jobs: fi python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 4 $CONTINUE_ON_ERROR_FLAG + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + with: + artifact-suffix: ${{ matrix.partition }} + stage-b-test-4-gpu-b200: needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels] if: | @@ -1106,91 +1156,8 @@ jobs: run: | IS_BLACKWELL=1 python3 -m pytest -q python/sglang/jit_kernel/tests/test_flash_attention_4.py - stage-c-test-large-4-gpu: - needs: [check-changes, call-gate, wait-for-stage-b, sgl-kernel-build-wheels] - if: | - always() && - ( - (inputs.target_stage == 'stage-c-test-large-4-gpu') || - ( - !inputs.target_stage && - ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && - ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) - ) - ) - runs-on: 4-gpu-h100 - timeout-minutes: 240 - env: - RUNNER_LABELS: 4-gpu-h100 - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - - - name: Download artifacts - if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v4 - with: - path: sgl-kernel/dist/ - merge-multiple: true - pattern: wheel-python3.10-cuda12.9 - - - name: Install dependencies - timeout-minutes: 20 - run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 30 - run: | - cd test/ - CONTINUE_ON_ERROR_FLAG="" - if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then - CONTINUE_ON_ERROR_FLAG="--continue-on-error" - fi - python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu $CONTINUE_ON_ERROR_FLAG - - 
stage-c-test-large-4-gpu-b200: - needs: [check-changes, call-gate, wait-for-stage-b, sgl-kernel-build-wheels] - if: | - always() && - ( - (inputs.target_stage == 'stage-c-test-large-4-gpu-b200') || - ( - !inputs.target_stage && - ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && - ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) - ) - ) - runs-on: ${{ needs.check-changes.outputs.b200_runner }} - timeout-minutes: 240 - env: - RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - - - name: Download artifacts - if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v6 - with: - path: sgl-kernel/dist/ - merge-multiple: true - pattern: wheel-python3.10-cuda12.9 - - - name: Install dependencies - timeout-minutes: 20 - run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/cuda/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 30 - run: | - cd test/ - IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu-b200 + - uses: ./.github/actions/upload-cuda-coredumps + if: always() multimodal-gen-test-1-gpu: needs: [check-changes, call-gate, sgl-kernel-build-wheels] @@ -1244,6 +1211,10 @@ jobs: --total-partitions 2 \ $CONTINUE_ON_ERROR_FLAG + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + with: + artifact-suffix: ${{ matrix.part }} multimodal-gen-test-2-gpu: needs: [check-changes, call-gate, sgl-kernel-build-wheels] @@ -1298,6 +1269,11 @@ jobs: --total-partitions 2 \ $CONTINUE_ON_ERROR_FLAG + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + with: + artifact-suffix: ${{ matrix.part }} + stage-c-test-4-gpu-h100: needs: [check-changes, call-gate, wait-for-stage-b] if: | 
@@ -1347,6 +1323,11 @@ jobs: fi python3 run_suite.py --hw cuda --suite stage-c-test-4-gpu-h100 --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 $CONTINUE_ON_ERROR_FLAG + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + with: + artifact-suffix: ${{ matrix.part }} + stage-c-test-8-gpu-h200: needs: [check-changes, call-gate, wait-for-stage-b] if: | @@ -1386,14 +1367,22 @@ jobs: run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh - # - name: Warmup Weights and JIT Compilation - # timeout-minutes: 20 - # run: | - # # An example command for testing the warmup. TODO: make this more general and move them to python scripts. - # python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code + - name: Warmup DeepGEMM JIT Compilation + timeout-minutes: 25 + run: | + python3 scripts/ci/cuda/warmup_deep_gemm.py \ + deepseek-ai/DeepSeek-V3-0324:8 \ + deepseek-ai/DeepSeek-V3.2-Exp:8 + + - name: Warmup Server CUDA Graphs + timeout-minutes: 25 + run: | + python3 scripts/ci/cuda/warmup_server.py \ + deepseek-ai/DeepSeek-V3-0324:8 \ + inclusionAI/Ring-2.5-1T:8 - name: Run test - timeout-minutes: 20 + timeout-minutes: 30 run: | cd test CONTINUE_ON_ERROR_FLAG="" @@ -1402,6 +1391,11 @@ jobs: fi python3 run_suite.py --hw cuda --suite stage-c-test-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 4 $CONTINUE_ON_ERROR_FLAG + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + with: + artifact-suffix: ${{ matrix.part }} + stage-c-test-8-gpu-h20: needs: [check-changes, call-gate, wait-for-stage-b] if: | @@ -1452,6 +1446,11 @@ jobs: fi python3 run_suite.py --hw cuda --suite stage-c-test-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 $CONTINUE_ON_ERROR_FLAG + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + with: + artifact-suffix: ${{ matrix.part }} + stage-c-test-deepep-4-gpu: needs: 
[check-changes, call-gate, wait-for-stage-b] if: | @@ -1487,6 +1486,18 @@ jobs: run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_deepep.sh + - name: Warmup DeepGEMM JIT Compilation + timeout-minutes: 25 + run: | + python3 scripts/ci/cuda/warmup_deep_gemm.py \ + lmsys/sglang-ci-dsv3-test:4 + + - name: Warmup Server CUDA Graphs + timeout-minutes: 25 + run: | + python3 scripts/ci/cuda/warmup_server.py \ + lmsys/sglang-ci-dsv3-test:4 + - name: Run test timeout-minutes: 20 run: | @@ -1497,6 +1508,9 @@ jobs: fi python3 run_suite.py --hw cuda --suite stage-c-test-deepep-4-gpu $CONTINUE_ON_ERROR_FLAG + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + stage-c-test-deepep-8-gpu-h200: needs: [check-changes, call-gate, wait-for-stage-b] if: | @@ -1532,6 +1546,19 @@ jobs: run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_deepep.sh + - name: Warmup DeepGEMM JIT Compilation + timeout-minutes: 25 + run: | + python3 scripts/ci/cuda/warmup_deep_gemm.py \ + deepseek-ai/DeepSeek-V3-0324:8 \ + deepseek-ai/DeepSeek-V3.2-Exp:8 + + - name: Warmup Server CUDA Graphs + timeout-minutes: 25 + run: | + python3 scripts/ci/cuda/warmup_server.py \ + deepseek-ai/DeepSeek-V3-0324:8 + - name: Run test timeout-minutes: 45 run: | @@ -1542,6 +1569,9 @@ jobs: fi python3 run_suite.py --hw cuda --suite stage-c-test-deepep-8-gpu-h200 $CONTINUE_ON_ERROR_FLAG + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + stage-c-test-4-gpu-b200: needs: [check-changes, call-gate, wait-for-stage-b] if: | @@ -1592,52 +1622,62 @@ jobs: fi IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-c-test-4-gpu-b200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 1800 $CONTINUE_ON_ERROR_FLAG - stage-c-test-4-gpu-gb200: - needs: [check-changes, call-gate, wait-for-stage-b, sgl-kernel-build-wheels-arm] - if: | - always() && - ( - 
(inputs.target_stage == 'stage-c-test-4-gpu-gb200') || - ( - !inputs.target_stage && - ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && - ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) - ) - ) - runs-on: 4-gpu-gb200 - timeout-minutes: 240 - env: - RUNNER_LABELS: 4-gpu-gb200 - strategy: - fail-fast: false - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - - - name: Download artifacts - if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v4 + - uses: ./.github/actions/upload-cuda-coredumps + if: always() with: - path: sgl-kernel/dist/ - merge-multiple: true - pattern: wheel-python3.10-cuda12.9-aarch64 - - - name: Install dependencies - timeout-minutes: 20 - run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 GRACE_BLACKWELL=1 bash scripts/ci/cuda/ci_install_deepep.sh - - - name: Run test - timeout-minutes: 45 - run: | - cd test - CONTINUE_ON_ERROR_FLAG="" - if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then - CONTINUE_ON_ERROR_FLAG="--continue-on-error" - fi - python3 run_suite.py --hw cuda --suite stage-c-test-4-gpu-gb200 --timeout-per-file 3600 $CONTINUE_ON_ERROR_FLAG + artifact-suffix: ${{ matrix.part }} + + # NOTE: GB200 stage temporarily disabled — no company-owned GB200 runner available yet. + # Re-enable when a 4-gpu-gb200 runner is provisioned. 
+ # stage-c-test-4-gpu-gb200: + # needs: [check-changes, call-gate, wait-for-stage-b, sgl-kernel-build-wheels-arm] + # if: | + # always() && + # ( + # (inputs.target_stage == 'stage-c-test-4-gpu-gb200') || + # ( + # !inputs.target_stage && + # ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && + # ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + # ) + # ) + # runs-on: 4-gpu-gb200 + # timeout-minutes: 240 + # env: + # RUNNER_LABELS: 4-gpu-gb200 + # strategy: + # fail-fast: false + # steps: + # - name: Checkout code + # uses: actions/checkout@v4 + # with: + # ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + # + # - name: Download artifacts + # if: needs.check-changes.outputs.sgl_kernel == 'true' + # uses: actions/download-artifact@v4 + # with: + # path: sgl-kernel/dist/ + # merge-multiple: true + # pattern: wheel-python3.10-cuda12.9-aarch64 + # + # - name: Install dependencies + # timeout-minutes: 20 + # run: | + # CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 GRACE_BLACKWELL=1 bash scripts/ci/cuda/ci_install_deepep.sh + # + # - name: Run test + # timeout-minutes: 45 + # run: | + # cd test + # CONTINUE_ON_ERROR_FLAG="" + # if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then + # CONTINUE_ON_ERROR_FLAG="--continue-on-error" + # fi + # python3 run_suite.py --hw cuda --suite stage-c-test-4-gpu-gb200 --timeout-per-file 3600 $CONTINUE_ON_ERROR_FLAG + # + # - uses: ./.github/actions/upload-cuda-coredumps + # if: always() pr-test-finish: needs: @@ -1664,7 +1704,6 @@ jobs: stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, - stage-c-test-large-4-gpu, stage-b-test-4-gpu-b200, stage-c-test-4-gpu-h100, stage-c-test-8-gpu-h20, @@ -1672,7 +1711,7 @@ jobs: stage-c-test-deepep-4-gpu, stage-c-test-deepep-8-gpu-h200, stage-c-test-4-gpu-b200, - 
stage-c-test-4-gpu-gb200, + # stage-c-test-4-gpu-gb200, # Temporarily disabled — no GB200 runner ] if: always() runs-on: ubuntu-latest diff --git a/.github/workflows/release-branch-cut.yml b/.github/workflows/release-branch-cut.yml index f39a8c5c688a..d5d796d09292 100644 --- a/.github/workflows/release-branch-cut.yml +++ b/.github/workflows/release-branch-cut.yml @@ -16,6 +16,7 @@ on: permissions: actions: write contents: write + pull-requests: read jobs: cut-release-branch: @@ -85,7 +86,7 @@ jobs: echo "Branch '$BRANCH_NAME' does not exist, proceeding with creation" - - name: Create and push release branch + - name: Create release branch id: set_output run: | COMMIT_SHA="${{ steps.validate.outputs.COMMIT_SHA }}" @@ -97,11 +98,33 @@ jobs: # Create branch from the specified commit git checkout -b "$BRANCH_NAME" "$COMMIT_SHA" - # Push the new branch - git push origin "$BRANCH_NAME" - echo "branch_name=$BRANCH_NAME" >> $GITHUB_OUTPUT - echo "Successfully created and pushed branch '$BRANCH_NAME' from commit '$COMMIT_SHA'" + echo "Successfully created branch '$BRANCH_NAME' from commit '$COMMIT_SHA'" + + - name: Update version references in documentation + run: | + BRANCH_NAME="${{ github.event.inputs.branch_name }}" + # Extract version from branch name (e.g., release/v0.5.8 -> v0.5.8) + VERSION=$(echo "$BRANCH_NAME" | sed 's/release\///') + + # Update git clone version references in docs + sed -i "s/git clone -b v[0-9]\+\.[0-9]\+\.[0-9]\+\.\?post\?[0-9]*/git clone -b $VERSION/" docs/get_started/install.md + sed -i "s/git clone -b v[0-9]\+\.[0-9]\+\.[0-9]\+\.\?post\?[0-9]*/git clone -b $VERSION/" docs/platforms/amd_gpu.md + + # Check if any changes were made + if git diff --quiet; then + echo "No version references needed updating" + else + git add docs/get_started/install.md docs/platforms/amd_gpu.md + git commit -m "docs: update version references to $VERSION" + echo "Updated version references to $VERSION" + fi + + - name: Push release branch + run: | + 
BRANCH_NAME="${{ steps.set_output.outputs.branch_name }}" + git push origin "$BRANCH_NAME" + echo "Successfully pushed branch '$BRANCH_NAME'" - name: Summary run: | diff --git a/.github/workflows/release-docker-amd-nightly.yml b/.github/workflows/release-docker-amd-nightly.yml index 2ca68ec29d9f..b07d473e9bb0 100644 --- a/.github/workflows/release-docker-amd-nightly.yml +++ b/.github/workflows/release-docker-amd-nightly.yml @@ -2,7 +2,7 @@ name: Release Docker Images Nightly (AMD) on: workflow_dispatch: schedule: - - cron: '0 13 * * *' + - cron: '0 12 * * *' concurrency: # A PR number if a pull request and otherwise the commit hash. This cancels @@ -78,7 +78,7 @@ jobs: tag=v${version}-${rocm_tag} - docker build . -f docker/rocm.Dockerfile --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }} --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic --build-arg SETUPTOOLS_SCM_PRETEND_VERSION=${pretend_version} -t rocm/sgl-dev:${tag}-${{ env.DATE }} --no-cache + docker build . -f docker/rocm.Dockerfile --build-arg SGL_BRANCH=${{ github.ref_name }} --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }} --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic --build-arg SETUPTOOLS_SCM_PRETEND_VERSION=${pretend_version} -t rocm/sgl-dev:${tag}-${{ env.DATE }} --no-cache docker push rocm/sgl-dev:${tag}-${{ env.DATE }} # Temporarily disable docker cache seeding until performant storage is in place diff --git a/.github/workflows/release-docker-amd-rocm720-nightly.yml b/.github/workflows/release-docker-amd-rocm720-nightly.yml new file mode 100644 index 000000000000..376817f8fa9f --- /dev/null +++ b/.github/workflows/release-docker-amd-rocm720-nightly.yml @@ -0,0 +1,82 @@ +name: Release Docker Images ROCm 7.2.0 Nightly Preview (AMD) +on: + workflow_dispatch: + schedule: + - cron: '0 12 * * *' + +concurrency: + # A PR number if a pull request and otherwise the commit hash. 
This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. + group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: True + +jobs: + publish: + if: github.repository == 'sgl-project/sglang' + runs-on: amd-docker-scale + environment: 'prod' + strategy: + fail-fast: false + matrix: + gpu_arch: ['gfx942-rocm720', 'gfx950-rocm720'] + build_type: ['all'] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Required for git describe to find tags + + - name: "Set Date" + run: | + echo "DATE=$(date +%Y%m%d)" >> $GITHUB_ENV + + - name: Get version from latest tag + id: version + run: | + # Get the latest version tag sorted by version number (e.g., v0.5.7 -> 0.5.7) + VERSION=$(git tag -l 'v[0-9]*' --sort=-v:refname | head -1 | sed 's/^v//') + + if [ -z "$VERSION" ]; then + echo "::error::Could not determine version from git tags" + exit 1 + fi + + # Get short commit hash of current HEAD + COMMIT_HASH=$(git rev-parse --short HEAD) + + # Compose pretend version for setuptools_scm: e.g., 0.5.8.dev20260211+g1a2b3c4 + PRETEND_VERSION="${VERSION}.dev${{ env.DATE }}+g${COMMIT_HASH}" + + echo "version=${VERSION}" >> $GITHUB_OUTPUT + echo "pretend_version=${PRETEND_VERSION}" >> $GITHUB_OUTPUT + echo "Detected version: ${VERSION}" + echo "Pretend version for pip: ${PRETEND_VERSION}" + + - name: Login to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_AMD_USERNAME }} + password: ${{ secrets.DOCKERHUB_AMD_TOKEN }} + + - name: Build and Push + run: | + version=${{ steps.version.outputs.version }} + pretend_version=${{ steps.version.outputs.pretend_version }} + echo "Version: ${version}" + echo "Pretend version: ${pretend_version}" + + if [ "${{ matrix.gpu_arch }}" = "gfx942-rocm720" ]; then + rocm_tag="rocm720-mi30x" + elif [ "${{ matrix.gpu_arch 
}}" = "gfx950-rocm720" ]; then + rocm_tag="rocm720-mi35x" + else + echo "Unsupported gfx arch" + exit 1 + fi + + tag=v${version}-${rocm_tag} + + docker build . -f docker/rocm.Dockerfile --build-arg SGL_BRANCH=${{ github.ref_name }} --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }} --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic --build-arg SETUPTOOLS_SCM_PRETEND_VERSION=${pretend_version} -t rocm/sgl-dev:${tag}-${{ env.DATE }} --no-cache + docker push rocm/sgl-dev:${tag}-${{ env.DATE }} diff --git a/.github/workflows/release-docker-amd.yml b/.github/workflows/release-docker-amd.yml index 2f2b613670d8..28c0edc15e5a 100644 --- a/.github/workflows/release-docker-amd.yml +++ b/.github/workflows/release-docker-amd.yml @@ -16,6 +16,7 @@ jobs: environment: 'prod' strategy: matrix: + rocm_version: ['rocm700', 'rocm720'] gpu_arch: ['gfx942', 'gfx950'] build_type: ['all'] steps: @@ -55,17 +56,33 @@ jobs: version=${{ steps.version.outputs.version }} echo "Version: ${version}" - if [ "${{ matrix.gpu_arch }}" = "gfx942" ]; then - rocm_tag="rocm700-mi30x" - elif [ "${{ matrix.gpu_arch }}" = "gfx950" ]; then - rocm_tag="rocm700-mi35x" + gpu_arch_suffix="" + if [ "${{ matrix.rocm_version }}" = "rocm700" ]; then + if [ "${{ matrix.gpu_arch }}" = "gfx942" ]; then + rocm_tag="rocm700-mi30x" + elif [ "${{ matrix.gpu_arch }}" = "gfx950" ]; then + rocm_tag="rocm700-mi35x" + else + echo "Unsupported gfx arch" + exit 1 + fi + elif [ "${{ matrix.rocm_version }}" = "rocm720" ]; then + gpu_arch_suffix="-${{ matrix.rocm_version }}" + if [ "${{ matrix.gpu_arch }}" = "gfx942" ]; then + rocm_tag="rocm720-mi30x" + elif [ "${{ matrix.gpu_arch }}" = "gfx950" ]; then + rocm_tag="rocm720-mi35x" + else + echo "Unsupported gfx arch" + exit 1 + fi else - echo "Unsupported gfx arch" + echo "Unsupported rocm version" exit 1 fi tag=v${version}-${rocm_tag} # rocm.Dockerfile expects SGL_BRANCH with 'v' prefix for git tag checkout - docker build . 
-f docker/rocm.Dockerfile --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }} --build-arg SGL_BRANCH=v${version} --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t lmsysorg/sglang:${tag} --no-cache + docker build . -f docker/rocm.Dockerfile --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }}${gpu_arch_suffix} --build-arg SGL_BRANCH=v${version} --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t lmsysorg/sglang:${tag} --no-cache docker push lmsysorg/sglang:${tag} diff --git a/.github/workflows/release-docker-cu13-framework.yml b/.github/workflows/release-docker-cu13-framework.yml index 8674c28f8d87..a0800b7fe455 100644 --- a/.github/workflows/release-docker-cu13-framework.yml +++ b/.github/workflows/release-docker-cu13-framework.yml @@ -9,11 +9,6 @@ on: version: description: "Version to build (without v prefix, e.g., 0.5.8)" required: true - flashinfer_version: - description: "FlashInfer version (default: 0.6.1)" - required: false - default: "0.6.1" - jobs: publish-x86: if: github.repository == 'sgl-project/sglang' @@ -72,7 +67,6 @@ jobs: --build-arg CUDA_VERSION=13.0.1 \ --build-arg BUILD_TYPE=all \ --build-arg INSTALL_FLASHINFER_JIT_CACHE=1 \ - --build-arg FLASHINFER_VERSION=${{ github.event.inputs.flashinfer_version }} \ --build-arg GRACE_BLACKWELL=0 \ --build-arg SGL_VERSION=${version} \ -t lmsysorg/sglang:${tag} \ @@ -125,7 +119,6 @@ jobs: --build-arg CUDA_VERSION=13.0.1 \ --build-arg BUILD_TYPE=all \ --build-arg INSTALL_FLASHINFER_JIT_CACHE=1 \ - --build-arg FLASHINFER_VERSION=${{ github.event.inputs.flashinfer_version }} \ --build-arg GRACE_BLACKWELL=1 \ --build-arg SGL_VERSION=${version} \ -t lmsysorg/sglang:${tag} \ diff --git a/.github/workflows/release-docker-cu13.yml b/.github/workflows/release-docker-cu13.yml deleted file mode 100644 index aa23483331ec..000000000000 --- a/.github/workflows/release-docker-cu13.yml +++ /dev/null @@ -1,122 +0,0 @@ -name: Build 
and Push CUDA 13 Docker Images - -# release this manually via workflow_dispatch for now -on: - workflow_dispatch: - schedule: - - cron: "0 0 * * *" -jobs: - build-dev: - if: ${{ github.repository == 'sgl-project/sglang' }} - runs-on: ${{ matrix.runner }} - strategy: - matrix: - include: - - runner: x64-docker-build-node - platform: linux/amd64 - build_type: all - grace_blackwell: 0 - tag: dev-x86-cu13 - version: 13.0.1 - - runner: arm-docker-build-node - platform: linux/arm64 - build_type: all - grace_blackwell: 1 - tag: dev-arm64-cu13 - version: 13.0.1 - steps: - - name: Delete huge unnecessary tools folder - run: rm -rf /opt/hostedtoolcache - - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Free disk space - uses: jlumbroso/free-disk-space@main - with: - tool-cache: true - docker-images: true - android: true - dotnet: true - haskell: true - large-packages: true - swap-storage: true - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Build and Push Dev Image - run: | - docker buildx build \ - --platform ${{ matrix.platform }} \ - --push \ - --target framework \ - -f docker/Dockerfile \ - --build-arg CUDA_VERSION=${{ matrix.version }} \ - --build-arg BUILD_TYPE=${{ matrix.build_type }} \ - --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) \ - --build-arg GRACE_BLACKWELL=${{ matrix.grace_blackwell }} \ - --build-arg USE_LATEST_SGLANG=1 \ - --build-arg INSTALL_FLASHINFER_JIT_CACHE=1 \ - -t lmsysorg/sglang:${{ matrix.tag }} \ - --no-cache \ - . 
- - create-manifests: - runs-on: ubuntu-22.04 - needs: [build-dev] - if: ${{ github.repository == 'sgl-project/sglang' }} - strategy: - matrix: - variant: - - tag: dev-cu13 - x86_tag: dev-x86-cu13 - arm64_tag: dev-arm64-cu13 - steps: - - uses: docker/setup-buildx-action@v3 - - - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - run: | - docker buildx imagetools create \ - -t lmsysorg/sglang:${{ matrix.variant.tag }} \ - -t lmsysorg/sglang:nightly-${{ matrix.variant.tag }}-$(date +%Y%m%d)-${GITHUB_SHA:0:8} \ - lmsysorg/sglang:${{ matrix.variant.x86_tag }} \ - lmsysorg/sglang:${{ matrix.variant.arm64_tag }} - - - name: Cleanup Old Nightly Builds - run: | - # Get JWT token for Docker Hub API - TOKEN=$(curl -s -H "Content-Type: application/json" -X POST -d '{"username": "${{ secrets.DOCKERHUB_USERNAME }}", "password": "${{ secrets.DOCKERHUB_TOKEN }}"}' https://hub.docker.com/v2/users/login/ | jq -r .token) - - # Get all tags for the repository - TAGS_RESPONSE=$(curl -s -H "Authorization: JWT $TOKEN" "https://hub.docker.com/v2/repositories/lmsysorg/sglang/tags/?page_size=100") - - # Extract tags that match our pattern and sort by last_updated timestamp (most recent first) - TAGS=$(echo "$TAGS_RESPONSE" | jq -r '.results[] | select(.name | startswith("nightly-${{ matrix.variant.tag }}-")) | "\(.last_updated)|\(.name)"' | sort -r | cut -d'|' -f2) - - # Count total tags and keep only the 14 most recent - TAG_COUNT=$(echo "$TAGS" | wc -l) - if [ "$TAG_COUNT" -gt 14 ]; then - echo "Found $TAG_COUNT nightly builds, keeping only the 14 most recent" - TAGS_TO_DELETE=$(echo "$TAGS" | tail -n +15) - echo "Tags to delete: $TAGS_TO_DELETE" - - # Delete old tags - for tag in $TAGS_TO_DELETE; do - echo "Deleting tag: $tag" - curl -X DELETE \ - -H "Authorization: JWT $TOKEN" \ - "https://hub.docker.com/v2/repositories/lmsysorg/sglang/tags/$tag/" - done - else - echo "Only $TAG_COUNT nightly builds found, 
no cleanup needed" - fi diff --git a/.github/workflows/release-docker-dev-pr.yml b/.github/workflows/release-docker-dev-pr.yml deleted file mode 100644 index 08323008cc3b..000000000000 --- a/.github/workflows/release-docker-dev-pr.yml +++ /dev/null @@ -1,116 +0,0 @@ -name: Build PR Development Docker Images - -on: - workflow_dispatch: - inputs: - pr_number: - description: 'PR number to build from' - required: true - type: string - pr_branch: - description: 'PR branch name to build from (e.g., my-feature-branch or refs/pull/123/head)' - required: true - type: string - -concurrency: - group: release-docker-dev-pr-${{ github.event.inputs.pr_number }} - cancel-in-progress: true - -jobs: - build-dev: - if: ${{ github.repository == 'sgl-project/sglang' }} - environment: "prod" - runs-on: ${{ matrix.runner }} - strategy: - matrix: - include: - - runner: x64-docker-build-node - platform: linux/amd64 - build_type: all - grace_blackwell: 0 - arch_tag: x86 - version: 12.9.1 - - runner: arm-docker-build-node - platform: linux/arm64 - build_type: all - grace_blackwell: 1 - arch_tag: arm64 - version: 12.9.1 - steps: - - name: Delete huge unnecessary tools folder - run: rm -rf /opt/hostedtoolcache - - - name: Checkout repository - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_branch }} - - - name: Free disk space - uses: jlumbroso/free-disk-space@main - with: - tool-cache: true - docker-images: true - android: true - dotnet: true - haskell: true - large-packages: true - swap-storage: true - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Build and Push Dev Image - run: | - tag=dev-${{ matrix.arch_tag }}-pr-${{ inputs.pr_number }} - - docker buildx build \ - --platform ${{ matrix.platform }} \ - --push \ - -f docker/Dockerfile \ - --target framework \ - --build-arg CUDA_VERSION=${{ 
matrix.version }} \ - --build-arg BUILD_TYPE=${{ matrix.build_type }} \ - --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) \ - --build-arg GRACE_BLACKWELL=${{ matrix.grace_blackwell }} \ - --build-arg BRANCH_TYPE=local \ - --build-arg INSTALL_FLASHINFER_JIT_CACHE=1 \ - -t lmsysorg/sglang:${tag} \ - --no-cache \ - . - - create-manifests: - runs-on: ubuntu-22.04 - needs: [build-dev] - if: ${{ github.repository == 'sgl-project/sglang' }} - environment: "prod" - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Create multi-arch manifest - run: | - # Create PR dev manifest - docker buildx imagetools create \ - -t lmsysorg/sglang:dev-pr-${{ inputs.pr_number }} \ - lmsysorg/sglang:dev-x86-pr-${{ inputs.pr_number }} \ - lmsysorg/sglang:dev-arm64-pr-${{ inputs.pr_number }} - - echo "✓ Built Docker image: lmsysorg/sglang:dev-pr-${{ inputs.pr_number }}" - echo "" - echo "Usage:" - echo " docker pull lmsysorg/sglang:dev-pr-${{ inputs.pr_number }}" diff --git a/.github/workflows/release-docker-dev.yml b/.github/workflows/release-docker-dev.yml index 19a17e21ece8..18dbed5734c9 100644 --- a/.github/workflows/release-docker-dev.yml +++ b/.github/workflows/release-docker-dev.yml @@ -2,9 +2,22 @@ name: Build and Push Development Docker Images on: workflow_dispatch: + inputs: + pr_number: + description: "PR number to build from (leave empty to use current branch)" + required: false + default: "" + tag: + description: "Custom tag suffix (overrides pr_number in tag). E.g. 'my-test' → dev-x86-my-test, dev-cu13-my-test, etc." 
+ required: false + default: "" schedule: - cron: "0 0 * * *" +concurrency: + group: release-docker-dev-${{ inputs.tag || inputs.pr_number || 'nightly' }} + cancel-in-progress: true + jobs: build-dev: if: ${{ github.repository == 'sgl-project/sglang' }} @@ -16,20 +29,34 @@ jobs: platform: linux/amd64 build_type: all grace_blackwell: 0 - tag: dev-x86 + arch_tag: x86 version: 12.9.1 - runner: arm-docker-build-node platform: linux/arm64 build_type: all grace_blackwell: 1 - tag: dev-arm64 + arch_tag: arm64 version: 12.9.1 + - runner: x64-docker-build-node + platform: linux/amd64 + build_type: all + grace_blackwell: 0 + arch_tag: x86-cu13 + version: 13.0.1 + - runner: arm-docker-build-node + platform: linux/arm64 + build_type: all + grace_blackwell: 1 + arch_tag: arm64-cu13 + version: 13.0.1 steps: - name: Delete huge unnecessary tools folder run: rm -rf /opt/hostedtoolcache - name: Checkout repository uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_number && format('refs/pull/{0}/head', inputs.pr_number) || github.ref }} - name: Free disk space uses: jlumbroso/free-disk-space@main @@ -42,6 +69,12 @@ jobs: large-packages: true swap-storage: true + - name: Prune Docker to reclaim disk space + run: | + docker buildx prune --filter "until=72h" -f + docker system prune -af --filter "until=72h" + docker volume prune -af + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -53,18 +86,37 @@ jobs: - name: Build and Push Dev Image run: | + # Tag suffix: custom tag > pr number > none + SUFFIX="" + if [ -n "${{ inputs.tag }}" ]; then + SUFFIX="-${{ inputs.tag }}" + elif [ -n "${{ inputs.pr_number }}" ]; then + SUFFIX="-pr-${{ inputs.pr_number }}" + fi + + TAG="dev-${{ matrix.arch_tag }}${SUFFIX}" + + # Nightly (schedule) installs latest release; manual dispatch builds from checked-out source + if [ "${{ github.event_name }}" = "schedule" ]; then + SOURCE_ARG="--build-arg USE_LATEST_SGLANG=1" + else + SOURCE_ARG="--build-arg BRANCH_TYPE=local" + fi + + echo 
"Building lmsysorg/sglang:${TAG}" + docker buildx build \ --platform ${{ matrix.platform }} \ --push \ - -f docker/Dockerfile \ --target framework \ + -f docker/Dockerfile \ --build-arg CUDA_VERSION=${{ matrix.version }} \ --build-arg BUILD_TYPE=${{ matrix.build_type }} \ --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) \ --build-arg GRACE_BLACKWELL=${{ matrix.grace_blackwell }} \ - --build-arg USE_LATEST_SGLANG=1 \ + ${SOURCE_ARG} \ --build-arg INSTALL_FLASHINFER_JIT_CACHE=1 \ - -t lmsysorg/sglang:${{ matrix.tag }} \ + -t lmsysorg/sglang:${TAG} \ --no-cache \ . @@ -75,9 +127,12 @@ jobs: strategy: matrix: variant: - - tag: dev - x86_tag: dev-x86 - arm64_tag: dev-arm64 + - base: dev + x86: x86 + arm64: arm64 + - base: dev-cu13 + x86: x86-cu13 + arm64: arm64-cu13 steps: - uses: docker/setup-buildx-action@v3 @@ -85,37 +140,56 @@ jobs: with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - - run: | - SHORT_SHA="${{ github.sha }}" + + - name: Create multi-arch manifest + run: | + SUFFIX="" + if [ -n "${{ inputs.tag }}" ]; then + SUFFIX="-${{ inputs.tag }}" + elif [ -n "${{ inputs.pr_number }}" ]; then + SUFFIX="-pr-${{ inputs.pr_number }}" + fi + + TAG="${{ matrix.variant.base }}${SUFFIX}" + X86_TAG="dev-${{ matrix.variant.x86 }}${SUFFIX}" + ARM64_TAG="dev-${{ matrix.variant.arm64 }}${SUFFIX}" + + # For nightly (no suffix), also stamp a dated tag + EXTRA_TAG="" + if [ -z "${SUFFIX}" ]; then + SHORT_SHA="${{ github.sha }}" + EXTRA_TAG="-t lmsysorg/sglang:nightly-${TAG}-$(date +%Y%m%d)-${SHORT_SHA:0:8}" + fi + docker buildx imagetools create \ - -t lmsysorg/sglang:${{ matrix.variant.tag }} \ - -t lmsysorg/sglang:nightly-${{ matrix.variant.tag }}-$(date +%Y%m%d)-${SHORT_SHA:0:8} \ - lmsysorg/sglang:${{ matrix.variant.x86_tag }} \ - lmsysorg/sglang:${{ matrix.variant.arm64_tag }} + -t lmsysorg/sglang:${TAG} \ + ${EXTRA_TAG} \ + lmsysorg/sglang:${X86_TAG} \ + lmsysorg/sglang:${ARM64_TAG} + + echo "✓ Published lmsysorg/sglang:${TAG}" - 
name: Cleanup Old Nightly Builds + if: ${{ !inputs.tag && !inputs.pr_number }} run: | - # Get JWT token for Docker Hub API - TOKEN=$(curl -s -H "Content-Type: application/json" -X POST -d '{"username": "${{ secrets.DOCKERHUB_USERNAME }}", "password": "${{ secrets.DOCKERHUB_TOKEN }}"}' https://hub.docker.com/v2/users/login/ | jq -r .token) + TOKEN=$(curl -s -H "Content-Type: application/json" \ + -X POST -d '{"username": "${{ secrets.DOCKERHUB_USERNAME }}", "password": "${{ secrets.DOCKERHUB_TOKEN }}"}' \ + https://hub.docker.com/v2/users/login/ | jq -r .token) - # Get all tags for the repository - TAGS_RESPONSE=$(curl -s -H "Authorization: JWT $TOKEN" "https://hub.docker.com/v2/repositories/lmsysorg/sglang/tags/?page_size=100") + TAGS_RESPONSE=$(curl -s -H "Authorization: JWT $TOKEN" \ + "https://hub.docker.com/v2/repositories/lmsysorg/sglang/tags/?page_size=100") - # Extract tags that match our pattern and sort by last_updated timestamp (most recent first) - TAGS=$(echo "$TAGS_RESPONSE" | jq -r '.results[] | select(.name | startswith("nightly-${{ matrix.variant.tag }}-")) | "\(.last_updated)|\(.name)"' | sort -r | cut -d'|' -f2) + TAGS=$(echo "$TAGS_RESPONSE" | jq -r \ + '.results[] | select(.name | test("^nightly-${{ matrix.variant.base }}-[0-9]")) | "\(.last_updated)|\(.name)"' \ + | sort -r | cut -d'|' -f2) - # Count total tags and keep only the 14 most recent TAG_COUNT=$(echo "$TAGS" | wc -l) if [ "$TAG_COUNT" -gt 14 ]; then echo "Found $TAG_COUNT nightly builds, keeping only the 14 most recent" TAGS_TO_DELETE=$(echo "$TAGS" | tail -n +15) - echo "Tags to delete: $TAGS_TO_DELETE" - - # Delete old tags for tag in $TAGS_TO_DELETE; do echo "Deleting tag: $tag" - curl -X DELETE \ - -H "Authorization: JWT $TOKEN" \ + curl -X DELETE -H "Authorization: JWT $TOKEN" \ "https://hub.docker.com/v2/repositories/lmsysorg/sglang/tags/$tag/" done else diff --git a/.github/workflows/release-docker-npu-nightly.yml b/.github/workflows/release-docker-npu-nightly.yml index 
df5db619dba8..fe49016a651b 100644 --- a/.github/workflows/release-docker-npu-nightly.yml +++ b/.github/workflows/release-docker-npu-nightly.yml @@ -1,5 +1,11 @@ name: Release Docker Images Nightly (NPU) on: + pull_request: + branches: + - 'main' + paths: + - '.github/workflows/release-docker-npu-nightly.yml' + - 'docker/npu.Dockerfile' workflow_dispatch: schedule: - cron: "0 0 * * *" @@ -74,6 +80,6 @@ jobs: push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }} provenance: false build-args: | - SGLANG_KERNEL_NPU_TAG=2026.01.28 + SGLANG_KERNEL_NPU_TAG=2026.02.01.post2 CANN_VERSION=${{ matrix.cann_version }} DEVICE_TYPE=${{ matrix.device_type }} diff --git a/.github/workflows/release-docker-npu.yml b/.github/workflows/release-docker-npu.yml index 12f96094925d..77a7bfe2c269 100644 --- a/.github/workflows/release-docker-npu.yml +++ b/.github/workflows/release-docker-npu.yml @@ -87,7 +87,7 @@ jobs: push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }} provenance: false build-args: | - SGLANG_KERNEL_NPU_TAG=2026.01.28 + SGLANG_KERNEL_NPU_TAG=2026.02.01.post2 CANN_VERSION=${{ matrix.cann_version }} DEVICE_TYPE=${{ matrix.device_type }} SGLANG_TAG=${{ steps.version.outputs.version }} diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml index fc725d88b126..18fa650f4f8b 100644 --- a/.github/workflows/release-docs.yml +++ b/.github/workflows/release-docs.yml @@ -1,6 +1,8 @@ name: Release Documentation on: + release: + types: [published] push: branches: - main @@ -14,6 +16,9 @@ concurrency: group: release-docs-${{ github.ref }} cancel-in-progress: true +env: + SGLANG_IS_IN_CI: true + jobs: execute-and-deploy: runs-on: 1-gpu-runner @@ -22,6 +27,11 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Fetch full git history for release index + if: github.event_name == 'release' + run: | + git fetch --prune --unshallow || git fetch --prune --depth=0 + - 
name: Install dependencies run: | bash scripts/ci/cuda/ci_install_dependency.sh @@ -50,10 +60,23 @@ jobs: make markdown python3 wrap_run_llm.py + if [[ "${{ github.event_name }}" == "release" ]]; then + python3 release_lookup/generate_index.py --output release_lookup/release_index.json + + # Copy release lookup tool for official docs on published releases. + mkdir -p _build/html/release_lookup + cp release_lookup/index.html _build/html/release_lookup/ + cp release_lookup/release_index.json _build/html/release_lookup/ + fi + cd _build/html git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1 - find ../sgl-project.github.io/ -mindepth 1 -not -path "../sgl-project.github.io/.git*" -not -name CNAME -not -name ".jekyll" -not -name ".nojekyll" -delete + if [[ "${{ github.event_name }}" == "release" ]]; then + find ../sgl-project.github.io/ -mindepth 1 -not -path "../sgl-project.github.io/.git*" -not -name CNAME -not -name ".jekyll" -not -name ".nojekyll" -delete + else + find ../sgl-project.github.io/ -mindepth 1 -not -path "../sgl-project.github.io/.git*" -not -path "../sgl-project.github.io/release_lookup*" -not -name CNAME -not -name ".jekyll" -not -name ".nojekyll" -delete + fi cp -r * ../sgl-project.github.io cp ../../README.md ../sgl-project.github.io/README.md cd ../sgl-project.github.io diff --git a/.github/workflows/release-pypi-nightly.yml b/.github/workflows/release-pypi-nightly.yml index 3246da699700..edc058bed1af 100644 --- a/.github/workflows/release-pypi-nightly.yml +++ b/.github/workflows/release-pypi-nightly.yml @@ -54,28 +54,26 @@ jobs: cd python cp ../README.md ../LICENSE . 
- # Parse git describe output to detect exact tag builds (distance=0) + # Parse git describe output to get latest tag # Use same command as pyproject.toml to ensure version consistency DESC=$(git tag --list --sort=-version:refname 'v*.*.*' | head -1 | xargs git describe --tags --long 2>/dev/null || echo 'v0.0.0-0-g0000000') - DIST=$(echo "$DESC" | cut -d- -f2) - - # If building at exact tag (distance=0), force dev0 version for unique wheel names - if [ "$DIST" = "0" ]; then - TAG=$(echo "$DESC" | cut -d- -f1) - HASH="g$(git rev-parse --short HEAD)" - - # Increment patch version for nightlies (e.g., v0.5.8 -> 0.5.9.dev0) - VERSION=${TAG#v} # Remove 'v' prefix - MAJOR=$(echo "$VERSION" | cut -d. -f1) - MINOR=$(echo "$VERSION" | cut -d. -f2) - PATCH=$(echo "$VERSION" | cut -d. -f3) - NEXT_PATCH=$((PATCH + 1)) - NEXT_VERSION="${MAJOR}.${MINOR}.${NEXT_PATCH}" - - FORCE_VERSION="${NEXT_VERSION}.dev0+${HASH}" - echo "Building at exact tag $TAG, forcing nightly version to: $FORCE_VERSION" - export SETUPTOOLS_SCM_PRETEND_VERSION="$FORCE_VERSION" - fi + TAG=$(echo "$DESC" | cut -d- -f1) + HASH="g$(git rev-parse --short HEAD)" + BUILD_DATE=$(date -u +%Y%m%d) + + # Increment patch version for nightlies (e.g., v0.5.8 -> 0.5.9) + VERSION=${TAG#v} # Remove 'v' prefix + MAJOR=$(echo "$VERSION" | cut -d. -f1) + MINOR=$(echo "$VERSION" | cut -d. -f2) + PATCH=$(echo "$VERSION" | cut -d. 
-f3) + NEXT_PATCH=$((PATCH + 1)) + NEXT_VERSION="${MAJOR}.${MINOR}.${NEXT_PATCH}" + + # Use date-based dev number for correct chronological sorting + # e.g., 0.5.9.dev20260215+g4cf4f0859 > 0.5.9.dev20260214+g45a4697d4 + FORCE_VERSION="${NEXT_VERSION}.dev${BUILD_DATE}+${HASH}" + echo "Forcing nightly version to: $FORCE_VERSION" + export SETUPTOOLS_SCM_PRETEND_VERSION="$FORCE_VERSION" # Build wheel python3 -m build --wheel diff --git a/.github/workflows/release-pypi-pr.yml b/.github/workflows/release-pypi-pr.yml index deff4665c574..d14be109f6ed 100644 --- a/.github/workflows/release-pypi-pr.yml +++ b/.github/workflows/release-pypi-pr.yml @@ -4,11 +4,7 @@ on: workflow_dispatch: inputs: pr_number: - description: 'PR number to build wheel for' - required: true - type: string - pr_branch: - description: 'PR branch name to build from (e.g., my-feature-branch or refs/pull/123/head)' + description: 'PR number to build wheel for (works with both internal and fork PRs)' required: true type: string @@ -27,7 +23,7 @@ jobs: steps: - uses: actions/checkout@v4 with: - ref: ${{ inputs.pr_branch }} + ref: refs/pull/${{ inputs.pr_number }}/head fetch-depth: 0 # Need full history for version generation - name: Set up Python @@ -38,13 +34,14 @@ jobs: - name: Generate PR wheel version id: gen_version run: | - # Get base version from setuptools_scm - cd python - pip install setuptools-scm - FULL_VERSION=$(python -c "from setuptools_scm import get_version; print(get_version(root='..'))") - # Strip any existing .dev or + suffix to get clean base version - BASE_VERSION=$(echo "$FULL_VERSION" | sed 's/\.dev.*//;s/+.*//') - cd .. + # Get base version from the latest v*.*.* git tag directly + # Note: We cannot use setuptools_scm here because the [tool.setuptools_scm] + # config (with custom git_describe_command) lives in python/pyproject.toml, + # not at the repo root. 
Without that config, setuptools_scm falls back to + # default git describe which finds gateway-* tags instead of v*.*.* release tags. + LATEST_TAG=$(git tag --list --sort=-version:refname 'v*.*.*' | head -1) + BASE_VERSION=${LATEST_TAG#v} + echo "Latest release tag: ${LATEST_TAG}" # Get commit info COMMIT_HASH=$(git rev-parse --short HEAD) diff --git a/.github/workflows/release-whl-kernel.yml b/.github/workflows/release-whl-kernel.yml index 2fe1e8aefa50..9e6cb37177a6 100644 --- a/.github/workflows/release-whl-kernel.yml +++ b/.github/workflows/release-whl-kernel.yml @@ -8,9 +8,24 @@ on: - sgl-kernel/python/sgl_kernel/version.py workflow_dispatch: inputs: + target: + type: choice + description: 'Build target' + required: false + default: 'all' + options: + - 'all' + - 'cu129' + - 'cu130' + - 'rocm700' + - 'musa43' tag_name: type: string required: false + pr_number: + description: "PR number to build from (e.g. 12345)" + type: string + required: false concurrency: group: release-sglang-kernels-${{ github.ref }} @@ -18,7 +33,9 @@ concurrency: jobs: build-cu129-matrix: - if: github.repository == 'sgl-project/sglang' + if: | + github.repository == 'sgl-project/sglang' && + (github.event.inputs.target == 'all' || github.event.inputs.target == 'cu129') strategy: matrix: python-version: ["3.10"] @@ -34,6 +51,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: "recursive" + ref: ${{ inputs.pr_number && format('refs/pull/{0}/head', inputs.pr_number) || '' }} - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 @@ -46,8 +64,8 @@ jobs: chmod +x ./build.sh ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}" ${{ matrix.arch == 'aarch64' && 'aarch64' || '' }} env: - USE_CCACHE: 0 - CMAKE_EXTRA_ARGS: ${{ matrix.arch == 'aarch64' && '-DENABLE_BELOW_SM90=ON' || '' }} + BUILD_JOBS: 64 + NVCC_THREADS: 8 - name: Upload to PyPI working-directory: sgl-kernel @@ -66,6 +84,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: 
actions/checkout@v4 + with: + ref: ${{ inputs.pr_number && format('refs/pull/{0}/head', inputs.pr_number) || '' }} - name: Download artifacts uses: actions/download-artifact@v4 @@ -112,7 +132,9 @@ jobs: # for now we do not release CUDA 13.0 wheels to pypi build-cu130-matrix: - if: github.repository == 'sgl-project/sglang' + if: | + github.repository == 'sgl-project/sglang' && + (github.event.inputs.target == 'all' || github.event.inputs.target == 'cu130') strategy: matrix: python-version: ["3.10"] @@ -128,6 +150,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: "recursive" + ref: ${{ inputs.pr_number && format('refs/pull/{0}/head', inputs.pr_number) || '' }} - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 @@ -140,7 +163,8 @@ jobs: chmod +x ./build.sh ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}" ${{ matrix.arch == 'aarch64' && 'aarch64' || '' }} env: - USE_CCACHE: 0 + BUILD_JOBS: 64 + NVCC_THREADS: 8 - name: Upload artifacts uses: actions/upload-artifact@v4 @@ -153,6 +177,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_number && format('refs/pull/{0}/head', inputs.pr_number) || '' }} - name: Download artifacts uses: actions/download-artifact@v4 @@ -198,7 +224,9 @@ jobs: git push build-rocm700: - if: github.repository == 'sgl-project/sglang' + if: | + github.repository == 'sgl-project/sglang' && + (github.event.inputs.target == 'all' || github.event.inputs.target == 'rocm700') runs-on: amd-docker-scale strategy: matrix: @@ -208,6 +236,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: "recursive" + ref: ${{ inputs.pr_number && format('refs/pull/{0}/head', inputs.pr_number) || '' }} - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 @@ -232,6 +261,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_number && format('refs/pull/{0}/head', inputs.pr_number) || '' }} - name: 
Download artifacts uses: actions/download-artifact@v4 @@ -275,3 +306,82 @@ jobs: git add -A git commit -m "update whl index" git push + + build-musa43: + if: | + github.repository == 'sgl-project/sglang' && + (github.event.inputs.target == 'all' || github.event.inputs.target == 'musa43') + runs-on: kernel-build-node-musa + strategy: + matrix: + python-version: ["3.10"] + musa-version: ["43"] + steps: + - uses: actions/checkout@v4 + with: + submodules: "recursive" + + - name: Build wheels + run: | + cd sgl-kernel + mv pyproject_musa.toml pyproject.toml + python setup_musa.py sdist bdist_wheel + + - name: Rename MUSA wheels + run: | + bash scripts/ci/musa/rename_wheels_musa.sh ${{ matrix.musa-version }} sgl-kernel/dist + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: wheel-python${{ matrix.python-version }}-musa${{ matrix.musa-version }} + path: sgl-kernel/dist/* + + release-musa43: + needs: build-musa43 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + path: sgl-kernel/dist/ + merge-multiple: true + pattern: wheel-* + + - name: Set tag name + id: set_tag_name + run: | + if [ -z "${{ inputs.tag_name }}" ]; then + TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)" + echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT + else + echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT + fi + + - name: Release + uses: softprops/action-gh-release@v2 + with: + tag_name: ${{ steps.set_tag_name.outputs.tag_name }} + repository: sgl-project/whl + token: ${{ secrets.GH_PAT_FOR_WHL_RELEASE }} + files: | + sgl-kernel/dist/* + + - name: Clone wheel index + run: git clone https://oauth2:${WHL_TOKEN}@github.com/sgl-project/whl.git sgl-whl + env: + WHL_TOKEN: ${{ secrets.GH_PAT_FOR_WHL_RELEASE }} + + - name: Update wheel index + run: python3 scripts/update_kernel_whl_index.py --musa 43 + + - name: Push wheel index + run: | + cd sgl-whl + git 
config --local user.name "sglang-bot" + git config --local user.email "sglangbot@gmail.com" + git add -A + git commit -m "update whl index" + git push diff --git a/.github/workflows/rerun-ut.yml b/.github/workflows/rerun-ut.yml new file mode 100644 index 000000000000..0e1cf7379da7 --- /dev/null +++ b/.github/workflows/rerun-ut.yml @@ -0,0 +1,71 @@ +name: Rerun UT +run-name: ${{ inputs.pr_head_sha && format('[rerun-ut] {0}', inputs.pr_head_sha) || '[rerun-ut]' }} + +on: + workflow_dispatch: + inputs: + test_command: + description: "Test command to run (e.g. 'registered/core/test_srt_endpoint.py TestSRTEndpoint.test_simple_decode')" + required: true + type: string + runner_label: + description: "Runner label (e.g. '1-gpu-runner', '1-gpu-5090', '4-gpu-h100')" + required: true + type: string + pr_head_sha: + description: "PR head SHA to checkout (for /rerun-ut on fork PRs)" + required: false + type: string + default: "" + use_deepep: + description: "Use ci_install_deepep.sh instead of ci_install_dependency.sh" + required: false + type: string + default: "false" + +env: + SGLANG_IS_IN_CI: true + SGLANG_CUDA_COREDUMP: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: true + +permissions: + actions: write + contents: read + +jobs: + rerun-ut-cuda: + runs-on: ${{ inputs.runner_label }} + timeout-minutes: 120 + env: + RUNNER_LABELS: ${{ inputs.runner_label }} + IS_BLACKWELL: ${{ (inputs.runner_label == '1-gpu-5090' || contains(inputs.runner_label, 'b200')) && '1' || '' }} + SGLANG_CI_RDMA_ALL_DEVICES: ${{ inputs.runner_label == '8-gpu-h20' && 'mlx5_1,mlx5_2,mlx5_3,mlx5_4' || '' }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || github.sha }} + + - name: Install dependencies + timeout-minutes: 20 + run: | + if [[ "${{ inputs.runner_label }}" == "1-gpu-5090" ]]; then + source /etc/profile.d/sglang-ci.sh + fi + if [[ "${{ inputs.use_deepep }}" == "true" ]]; then + bash scripts/ci/cuda/ci_install_deepep.sh + else + bash 
scripts/ci/cuda/ci_install_dependency.sh + fi + + - name: Run test + timeout-minutes: 60 + run: | + if [[ "${{ inputs.runner_label }}" == "1-gpu-5090" ]]; then + source /etc/profile.d/sglang-ci.sh + fi + cd test/ + python3 ${{ inputs.test_command }} + + - uses: ./.github/actions/upload-cuda-coredumps + if: always() diff --git a/.github/workflows/retag-docker.yml b/.github/workflows/retag-docker.yml new file mode 100644 index 000000000000..633a275ed033 --- /dev/null +++ b/.github/workflows/retag-docker.yml @@ -0,0 +1,30 @@ +name: Retag Docker Image + +on: + workflow_dispatch: + inputs: + source_tag: + description: "Existing image tag (e.g., v0.4.7-cu129-amd64)" + required: true + target_tag: + description: "New tag to apply (e.g., latest)" + required: true + +jobs: + retag: + if: github.repository == 'sgl-project/sglang' + runs-on: ubuntu-22.04 + environment: "prod" + steps: + - name: Login to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Retag image + run: | + echo "Retagging lmsysorg/sglang:${{ inputs.source_tag }} -> lmsysorg/sglang:${{ inputs.target_tag }}" + docker buildx imagetools create \ + -t lmsysorg/sglang:${{ inputs.target_tag }} \ + lmsysorg/sglang:${{ inputs.source_tag }} diff --git a/.github/workflows/slash-command-handler.yml b/.github/workflows/slash-command-handler.yml index 012208f9f271..9411e0798e72 100644 --- a/.github/workflows/slash-command-handler.yml +++ b/.github/workflows/slash-command-handler.yml @@ -19,7 +19,8 @@ jobs: (contains(github.event.comment.body, '/tag-run-ci-label') || contains(github.event.comment.body, '/rerun-failed-ci') || contains(github.event.comment.body, '/tag-and-rerun-ci') || - contains(github.event.comment.body, '/rerun-stage')) + contains(github.event.comment.body, '/rerun-stage') || + contains(github.event.comment.body, '/rerun-ut')) runs-on: ubuntu-latest steps: diff --git a/.gitignore b/.gitignore index 
00d145ca5546..f25d4e85f81d 100644 --- a/.gitignore +++ b/.gitignore @@ -245,10 +245,10 @@ sgl-model-gateway/tests/fixtures/golden/ lmms-eval -**/.claude/ **/.serena/ ctags/ outputs/ +inputs/ # Eval Cache .longbench_cache/ @@ -262,10 +262,6 @@ outputs/ # setuptools-scm generated version file python/sglang/_version.py -# Generated protobuf files (regenerate during wheel build or with compile_proto.py) -python/sglang/srt/grpc/*_pb2.py -python/sglang/srt/grpc/*_pb2_grpc.py -python/sglang/srt/grpc/*_pb2.pyi # MUSA section # Generated source files by torchada @@ -275,3 +271,9 @@ sgl-kernel/csrc/**/*_musa/ # MUSA core dump files *.mudmp + +# Others +# diffusion 3D outputs +*.glb +*.ply +*.npz diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7abe48029ea7..f088453f37ed 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ exclude: ^(python/sglang/multimodal_gen/csrc|python/sglang/jit_kernel/flash_atte repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: check-symlinks - id: destroyed-symlinks @@ -21,12 +21,12 @@ repos: - id: debug-statements - id: no-commit-to-branch - repo: https://github.com/PyCQA/isort - rev: 5.13.2 + rev: 7.0.0 hooks: - id: isort exclude: '^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$' - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.7 + rev: v0.15.1 hooks: - id: ruff args: @@ -43,7 +43,7 @@ repos: python/sglang/srt/grpc/.*_pb2_grpc\.pyi$| )$ - repo: https://github.com/psf/black - rev: 24.10.0 + rev: 26.1.0 hooks: - id: black-jupyter exclude: '^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$' @@ -53,13 +53,13 @@ repos: - id: codespell args: ['--config', '.codespellrc'] - repo: https://github.com/pre-commit/mirrors-clang-format 
- rev: v18.1.8 + rev: v20.1.7 hooks: - id: clang-format types_or: [c++, cuda] args: [--style=file, --verbose] - repo: https://github.com/kynan/nbstripout - rev: 0.8.1 + rev: 0.9.0 hooks: - id: nbstripout args: @@ -67,6 +67,14 @@ repos: - '--extra-keys=metadata.kernelspec metadata.language_info.version' - repo: local hooks: + - id: check-chinese-characters + name: check chinese characters in multimodal_gen + entry: >- + python3 -c 'import sys, re; p=re.compile(r"[\u4e00-\u9fff]"); ec=0; [ ([(print(f"{f}:{i+1}: {l.strip()}") or (ec:=1)) for i,l in enumerate(open(f, "r", encoding="utf-8", errors="ignore")) if p.search(l)]) for f in sys.argv[1:] ]; sys.exit(ec)' + language: system + files: ^python/sglang/multimodal_gen/.* + exclude: ^(python/sglang/multimodal_gen/configs/sample|python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/workflows|python/sglang/multimodal_gen/runtime/pipelines_core/stages/model_specific_stages)(/|$) + types_or: [python, markdown, json, text] - id: sort-ci-permissions name: sort CI_PERMISSIONS.json entry: python3 .github/update_ci_permission.py --sort-only diff --git a/3rdparty/amd/tuning/TUNING.md b/3rdparty/amd/tuning/TUNING.md index e7b9b2049d61..a903bba03eca 100644 --- a/3rdparty/amd/tuning/TUNING.md +++ b/3rdparty/amd/tuning/TUNING.md @@ -25,7 +25,7 @@ To maximize Triton kernel efficiency, several strategies can be employed: triton.Config({'waves_per_eu': 4}, num_warps=16, num_stages=1), ], key=['BLOCK_N', 'NUM_TOKEN_BLKS'], use_cuda_graph=True) @triton.jit -def _triton_kernel_funtion(): +def _triton_kernel_function(): ... ``` ## 2. 
Torch Tunable Operations diff --git a/3rdparty/amd/tuning/benchmark_moe_rocm.py b/3rdparty/amd/tuning/benchmark_moe_rocm.py index af596d218310..131b25270ab8 100644 --- a/3rdparty/amd/tuning/benchmark_moe_rocm.py +++ b/3rdparty/amd/tuning/benchmark_moe_rocm.py @@ -187,10 +187,8 @@ def run_grid(bs, model, method, tp_size, dtype: str): configs = union_of_list_of_dicts(prune_configs_1, prune_configs_2) - print( - f"{bs=} || {len(full_configs)=} | {len(prune_configs_1)=} | \ - {len(prune_configs_2)=} | {len(configs)=}" - ) + print(f"{bs=} || {len(full_configs)=} | {len(prune_configs_1)=} | \ + {len(prune_configs_2)=} | {len(configs)=}") best_config = None best_time_us = 1e20 diff --git a/benchmark/asr/README.md b/benchmark/asr/README.md new file mode 100644 index 000000000000..0acbf1c30fae --- /dev/null +++ b/benchmark/asr/README.md @@ -0,0 +1,166 @@ +# ASR Benchmark + +This benchmark evaluates the performance and accuracy (Word Error Rate - WER) of Automatic Speech Recognition (ASR) models served via SGLang. + +## Supported Models + +- `openai/whisper-large-v3` +- `openai/whisper-large-v3-turbo` + +## Setup + +Install the required dependencies: + +```bash +apt install ffmpeg +pip install librosa soundfile datasets evaluate jiwer transformers openai torchcodec torch +``` + +## Running the Benchmark + +### 1. Start SGLang Server + +Launch the SGLang server with a Whisper model: + +```bash +python -m sglang.launch_server --model-path openai/whisper-large-v3 --port 30000 +``` + +### 2. 
Run the Benchmark Script + +Basic usage (using chat completions API): + +```bash +python bench_sglang.py --base-url http://localhost:30000 --model openai/whisper-large-v3 --n-examples 10 +``` + +Using the OpenAI-compatible transcription API: + +```bash +python bench_sglang.py \ + --base-url http://localhost:30000 \ + --model openai/whisper-large-v3 \ + --api-type transcription \ + --language English \ + --n-examples 10 +``` + +Run with streaming and show real-time output: + +```bash +python bench_sglang.py \ + --base-url http://localhost:30000 \ + --model openai/whisper-large-v3 \ + --api-type transcription \ + --stream \ + --show-predictions \ + --concurrency 1 +``` + +Run with higher concurrency and save results: + +```bash +python bench_sglang.py \ + --base-url http://localhost:30000 \ + --model openai/whisper-large-v3 \ + --concurrency 8 \ + --n-examples 100 \ + --output results.json \ + --show-predictions +``` + +## Arguments + +| Argument | Description | Default | +|----------|-------------|---------| +| `--base-url` | SGLang server URL | `http://localhost:30000` | +| `--model` | Model name on the server | `openai/whisper-large-v3` | +| `--dataset` | HuggingFace dataset for evaluation | `D4nt3/esb-datasets-earnings22-validation-tiny-filtered` | +| `--split` | Dataset split to use | `validation` | +| `--concurrency` | Number of concurrent requests | `4` | +| `--n-examples` | Number of examples to process (`-1` for all) | `-1` | +| `--output` | Path to save results as JSON | `None` | +| `--show-predictions` | Display sample predictions | `False` | +| `--print-n` | Number of samples to display | `5` | +| `--api-type` | API to use: `chat` (chat completions) or `transcription` (audio transcriptions) | `chat` | +| `--language` | Language for transcription API (e.g., `English`, `en`) | `None` | +| `--stream` | Enable streaming mode for transcription API | `False` | + +## Metrics + +The benchmark outputs: + +| Metric | Description | +|--------|-------------| +| 
**Total Requests** | Number of successful ASR requests processed | +| **WER** | Word Error Rate (lower is better), computed using the `evaluate` library | +| **Average Latency** | Mean time per request (seconds) | +| **Median Latency** | 50th percentile latency (seconds) | +| **95th Latency** | 95th percentile latency (seconds) | +| **Throughput** | Requests processed per second | +| **Token Throughput** | Output tokens per second | + +## Example Output + +```bash +python bench_sglang.py --api-type transcription --concurrency 128 --model openai/whisper-large-v3 --show-predictions + +Loading dataset: D4nt3/esb-datasets-earnings22-validation-tiny-filtered... +Using API type: transcription +Repo card metadata block was not found. Setting CardData to empty. +WARNING:huggingface_hub.repocard:Repo card metadata block was not found. Setting CardData to empty. +Performing warmup... +Processing 511 samples... +------------------------------ +Results for openai/whisper-large-v3: +Total Requests: 511 +WER: 12.7690 +Average Latency: 1.3602s +Median Latency: 1.2090s +95th Latency: 2.9986s +Throughput: 19.02 req/s +Token Throughput: 354.19 tok/s +Total Test Time: 26.8726s +------------------------------ + +==================== Sample Predictions ==================== +Sample 1: + REF: on the use of taxonomy i you know i think it is it is early days for us to to make any clear indications to the market about the proportion that would fall under that requirement + PRED: on the eu taxonomy i think it is early days for us to make any clear indications to the market about the proportion that would fall under that requirement +---------------------------------------- +Sample 2: + REF: so within fiscal year 2021 say 120 a 100 depending on what the micro will do and next year it is not necessarily payable in q one is we will look at what the cash flows for 2022 look like + PRED: so within fiscal year 2021 say $120000 $100000 depending on what the macro will do and next year it is not 
necessarily payable in q one is we will look at what the cash flows for 2022 look like +---------------------------------------- +Sample 3: + REF: we talked about 4.7 gigawatts + PRED: we talked about 4.7 gigawatts +---------------------------------------- +Sample 4: + REF: and you know depending on that working capital build we will we will see what that yields + PRED: and depending on that working capital build we will see what that yields what +---------------------------------------- +Sample 5: + REF: so on on sinopec what we have agreed with sinopec way back then is that free cash flows after paying all capexs are distributed out 30 70% + PRED: so on sinopec what we have agreed with sinopec way back then is that free cash flows after paying all capexes are distributed out 30% 70% +---------------------------------------- +============================================================ +``` + +## Notes + +- Audio samples longer than 30 seconds are automatically filtered out (Whisper limitation) +- The benchmark performs a warmup request before measuring performance +- Results are normalized using the model's tokenizer when available +- When using `--stream` with `--show-predictions`, use `--concurrency 1` for clean sequential output +- The `--language` option accepts both full names (e.g., `English`) and ISO 639-1 codes (e.g., `en`) + +## Troubleshooting + +**Server connection refused** +- Ensure the SGLang server is running and accessible at the specified `--base-url` +- Check that the port is not blocked by a firewall + +**Out of memory errors** +- Reduce `--concurrency` to lower GPU memory usage +- Use a smaller Whisper model variant diff --git a/benchmark/asr/bench_sglang.py b/benchmark/asr/bench_sglang.py new file mode 100644 index 000000000000..875ed952bf60 --- /dev/null +++ b/benchmark/asr/bench_sglang.py @@ -0,0 +1,404 @@ +import argparse +import asyncio +import base64 +import io +import json +import time +from statistics import mean, median + +import 
httpx +import librosa +import numpy as np +import soundfile +from datasets import load_dataset +from evaluate import load +from openai import AsyncOpenAI, OpenAI +from transformers import AutoTokenizer + + +def to_bytes(y, sr): + buffer = io.BytesIO() + soundfile.write(buffer, y, sr, format="WAV") + buffer.seek(0) + return buffer + + +async def run_asr_chat(client, model_name, y, sr): + """Use chat completions API with audio_url for ASR.""" + with to_bytes(y, sr) as f: + audio_bytes = f.read() + audio_base64 = base64.b64encode(audio_bytes).decode("utf-8") + + start_time = time.perf_counter() + response = await client.chat.completions.create( + model=model_name, + messages=[ + { + "role": "user", + "content": [ + { + "type": "audio_url", + "audio_url": {"url": f"data:audio/wav;base64,{audio_base64}"}, + } + ], + } + ], + temperature=0.0, + ) + end_time = time.perf_counter() + + asr_text = response.choices[0].message.content + latency = end_time - start_time + return latency, asr_text + + +def run_asr_transcription_sync(client, model_name, y, sr, language=None): + """Use audio transcriptions API for ASR (sync version).""" + audio_buffer = to_bytes(y, sr) + audio_buffer.name = "audio.wav" # OpenAI client needs a name attribute + + start_time = time.perf_counter() + kwargs = { + "model": model_name, + "file": audio_buffer, + } + if language: + kwargs["language"] = language + + transcription = client.audio.transcriptions.create(**kwargs) + end_time = time.perf_counter() + + latency = end_time - start_time + return latency, transcription.text + + +def run_asr_transcription_stream_sync( + base_url, model_name, y, sr, language=None, show_stream=False +): + """Use audio transcriptions API with streaming for ASR.""" + audio_buffer = to_bytes(y, sr) + audio_bytes = audio_buffer.read() + + data = { + "model": model_name, + "response_format": "json", + "stream": "true", + } + if language: + data["language"] = language + + start_time = time.perf_counter() + text_chunks = [] + + 
if show_stream: + print("[STREAM] ", end="", flush=True) + + with httpx.stream( + "POST", + f"{base_url}/v1/audio/transcriptions", + data=data, + files={"file": ("audio.wav", audio_bytes, "audio/wav")}, + timeout=60.0, + ) as response: + for line in response.iter_lines(): + if line.startswith("data: ") and not line.startswith("data: [DONE]"): + try: + chunk = json.loads(line[6:]) + if "choices" in chunk and chunk["choices"]: + delta = chunk["choices"][0].get("delta", {}) + content = delta.get("content", "") + if content: + text_chunks.append(content) + if show_stream: + print(content, end="", flush=True) + except json.JSONDecodeError: + pass + + if show_stream: + print() # newline after stream + + end_time = time.perf_counter() + latency = end_time - start_time + return latency, "".join(text_chunks) + + +async def run_asr_transcription( + client, + model_name, + y, + sr, + language=None, + stream=False, + base_url=None, + show_stream=False, +): + """Async wrapper for transcription API (runs sync call in executor).""" + loop = asyncio.get_event_loop() + if stream: + return await loop.run_in_executor( + None, + run_asr_transcription_stream_sync, + base_url, + model_name, + y, + sr, + language, + show_stream, + ) + return await loop.run_in_executor( + None, run_asr_transcription_sync, client, model_name, y, sr, language + ) + + +async def bound_asr( + sem, + client, + model_name, + tokenizer, + audio, + reference, + api_type="chat", + language=None, + stream=False, + base_url=None, + show_stream=False, +): + async with sem: + try: + if api_type == "transcription": + latency, text = await run_asr_transcription( + client, + model_name, + *audio, + language=language, + stream=stream, + base_url=base_url, + show_stream=show_stream, + ) + else: + latency, text = await run_asr_chat(client, model_name, *audio) + + # Calculate tokens for throughput metrics + num_output_tokens = len(tokenizer(text, add_special_tokens=False).input_ids) + + # Normalize for WER evaluation + # 
Whisper tokenizer has a normalize method + if hasattr(tokenizer, "normalize"): + out = tokenizer.normalize(text) + ref = tokenizer.normalize(reference) + else: + out = text.lower().strip() + ref = reference.lower().strip() + + return latency, num_output_tokens, out, ref + except Exception as e: + print(f"Error during ASR: {e}") + return None + + +async def process_dataset( + model_name, + client, + data, + concurrent_request, + api_type="chat", + language=None, + stream=False, + base_url=None, + show_predictions=False, +): + sem = asyncio.Semaphore(concurrent_request) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + # Warmup + print("Performing warmup...") + audio_warmup, sr_warmup = ( + data[0]["audio"]["array"], + data[0]["audio"]["sampling_rate"], + ) + await bound_asr( + sem, + client, + model_name, + tokenizer, + (audio_warmup, sr_warmup), + "", + api_type=api_type, + language=language, + stream=stream, + base_url=base_url, + show_stream=False, # Don't show stream during warmup + ) + + tasks = [] + print(f"Processing {len(data)} samples...") + for sample in data: + audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"] + tasks.append( + asyncio.create_task( + bound_asr( + sem, + client, + model_name, + tokenizer, + (audio, sr), + sample["text"], + api_type=api_type, + language=language, + stream=stream, + base_url=base_url, + show_stream=show_predictions and stream, + ) + ) + ) + + results = await asyncio.gather(*tasks) + return [r for r in results if r is not None] + + +def run_evaluation(args): + # Use sync client for transcription API, async for chat API + if args.api_type == "transcription": + client = OpenAI(base_url=f"{args.base_url}/v1", api_key="None") + else: + client = AsyncOpenAI(base_url=f"{args.base_url}/v1", api_key="None") + + print(f"Loading dataset: {args.dataset}...") + print(f"Using API type: {args.api_type}" + (f" (streaming)" if args.stream else "")) + dataset = load_dataset(args.dataset, split=args.split) + + # 
Filter by duration if needed (Whisper max is 30s) + def add_duration(sample): + y, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"] + sample["duration_ms"] = librosa.get_duration(y=y, sr=sr) * 1000 + return sample + + if "duration_ms" not in dataset.column_names: + dataset = dataset.map(add_duration) + + dataset = dataset.filter(lambda x: x["duration_ms"] < 30000) + + if args.n_examples > 0: + dataset = dataset.select(range(min(args.n_examples, len(dataset)))) + + start = time.perf_counter() + results = asyncio.run( + process_dataset( + args.model, + client, + dataset, + args.concurrency, + api_type=args.api_type, + language=args.language, + stream=args.stream, + base_url=args.base_url, + show_predictions=args.show_predictions, + ) + ) + total_test_time = time.perf_counter() - start + + if not results: + print("No successful results to evaluate.") + return + + # Metrics + latencies = [res[0] for res in results] + total_tokens = sum([res[1] for res in results]) + predictions = [res[2] for res in results] + references = [res[3] for res in results] + + wer_metric = load("wer") + wer_score = 100 * wer_metric.compute(references=references, predictions=predictions) + + print("-" * 30) + print(f"Results for {args.model}:") + print(f"Total Requests: {len(results)}") + print(f"WER: {wer_score:.4f}") + print(f"Average Latency: {mean(latencies):.4f}s") + print(f"Median Latency: {median(latencies):.4f}s") + print(f"95th Latency: {np.percentile(latencies, 95):.4f}s") + print(f"Throughput: {len(results) / total_test_time:.2f} req/s") + print(f"Token Throughput: {total_tokens / total_test_time:.2f} tok/s") + print(f"Total Test Time: {total_test_time:.4f}s") + print("-" * 30) + + if args.output: + with open(args.output, "w") as f: + import json + + json.dump( + { + "model": args.model, + "dataset": args.dataset, + "wer": wer_score, + "avg_latency": mean(latencies), + "throughput": len(results) / total_test_time, + "token_throughput": total_tokens / total_test_time, 
+ }, + f, + indent=2, + ) + + if args.show_predictions: + print("\n" + "=" * 20 + " Sample Predictions " + "=" * 20) + num_to_show = min(args.print_n, len(results)) + for i in range(num_to_show): + print(f"Sample {i+1}:") + print(f" REF: {references[i]}") + print(f" PRED: {predictions[i]}") + print("-" * 40) + print("=" * 60) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Benchmark SGLang ASR performance.") + parser.add_argument( + "--base-url", default="http://localhost:30000", help="SGLang server base URL" + ) + parser.add_argument( + "--model", default="openai/whisper-large-v3", help="Model name on the server" + ) + parser.add_argument( + "--dataset", + default="D4nt3/esb-datasets-earnings22-validation-tiny-filtered", + help="HF dataset repo", + ) + parser.add_argument("--split", default="validation", help="Dataset split") + parser.add_argument( + "--concurrency", type=int, default=4, help="Number of concurrent requests" + ) + parser.add_argument( + "--n-examples", + "-n", + type=int, + default=-1, + help="Number of examples to test (-1 for all)", + ) + parser.add_argument("--output", help="Path to save results in JSON") + parser.add_argument( + "--show-predictions", + action="store_true", + help="Print sample predictions and references", + ) + parser.add_argument( + "--print-n", type=int, default=5, help="Number of sample predictions to print" + ) + parser.add_argument( + "--api-type", + choices=["chat", "transcription"], + default="chat", + help="API type to use: 'chat' for chat completions with audio_url, 'transcription' for audio.transcriptions API", + ) + parser.add_argument( + "--language", + default=None, + help="Language code for transcription API (e.g., 'en')", + ) + parser.add_argument( + "--stream", + action="store_true", + help="Use streaming mode for transcription API", + ) + args = parser.parse_args() + + run_evaluation(args) diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md index
ff2769f1e042..cf6a569cbbab 100644 --- a/benchmark/deepseek_v3/README.md +++ b/benchmark/deepseek_v3/README.md @@ -4,7 +4,7 @@ The SGLang and DeepSeek teams collaborated to get DeepSeek V3 FP8 running on NVI Special thanks to Meituan's Search & Recommend Platform Team and Baseten's Model Performance Team for implementing the model, and DataCrunch for providing GPU resources. -For optimizations made on the DeepSeek series models regarding SGLang, please refer to [DeepSeek Model Optimizations in SGLang](https://docs.sglang.io/basic_usage/deepseek.html). +For optimizations made on the DeepSeek series models regarding SGLang, please refer to [DeepSeek V3/V3.1/R1 Model Optimizations in SGLang](https://docs.sglang.io/basic_usage/deepseek_v3.html#optimizations). ## Installation & Launch @@ -33,7 +33,7 @@ Add [performance optimization options](#performance-optimization-options) as nee ```bash # Installation -pip install "sglang[all]>=0.5.6.post2" +pip install sglang # Launch python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code @@ -271,7 +271,7 @@ Then we can benchmark the accuracy and latency by accessing the first node's exp ```bash # bench accuracy -python3 benchmark/gsm8k/bench_sglang.py --num-questions 1319 --host http://10.0.0.1 --port 30000 +python3 benchmark/gsm8k/bench_sglang.py --num-questions 1319 --host 10.0.0.1 --port 30000 # bench latency python3 -m sglang.bench_one_batch_server --model None --base-url http://10.0.0.1:30000 --batch-size 1 --input-len 128 --output-len 128 diff --git a/benchmark/fla/benchmark_layernorm_gated.py b/benchmark/fla/benchmark_layernorm_gated.py index 82440582bc2d..e678d8c31966 100644 --- a/benchmark/fla/benchmark_layernorm_gated.py +++ b/benchmark/fla/benchmark_layernorm_gated.py @@ -7,7 +7,9 @@ from sglang.srt.layers.attention.fla.layernorm_gated import ( _layer_norm_fwd as layer_norm_fwd, ) -from sglang.srt.layers.attention.fla.layernorm_gated import rms_norm_ref +from 
sglang.srt.layers.attention.fla.layernorm_gated import ( + rms_norm_ref, +) def benchmark_layer_norm_fwd( diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py index 98c28b39b373..be766cd9af5c 100644 --- a/benchmark/gsm8k/bench_sglang.py +++ b/benchmark/gsm8k/bench_sglang.py @@ -48,6 +48,18 @@ def main(args): # Select backend set_default_backend(select_sglang_backend(args)) + # Load tokenizer if enable_thinking is set + tokenizer = None + if args.enable_thinking: + from transformers import AutoTokenizer + + assert ( + args.tokenizer_path is not None + ), "--tokenizer-path is required when --enable-thinking is set" + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_path, trust_remote_code=True + ) + # Read data if args.platinum: print("Loading GSM8K Platinum dataset from HuggingFace...") @@ -70,7 +82,16 @@ def main(args): questions = [] labels = [] for i in range(len(lines[:num_questions])): - questions.append(get_one_example(lines, i, False)) + raw_question = few_shot_examples + get_one_example(lines, i, False) + if tokenizer is not None: + messages = [{"role": "user", "content": raw_question}] + raw_question = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=True, + ) + questions.append(raw_question) labels.append(get_answer_value(lines[i]["answer"])) assert all(l != INVALID for l in labels) arguments = [{"question": q} for q in questions] @@ -83,9 +104,11 @@ def main(args): @sgl.function def few_shot_gsm8k(s, question): - s += few_shot_examples + question + s += question s += sgl.gen( - "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"] + "answer", + max_tokens=args.max_new_tokens, + stop=["Question", "Assistant:", "<|separator|>"], ) ##################################### @@ -96,7 +119,8 @@ def few_shot_gsm8k(s, question): tic = time.perf_counter() states = few_shot_gsm8k.run_batch( arguments, - temperature=0, + temperature=args.temperature, + 
top_p=args.top_p, num_threads=args.parallel, progress_bar=True, ) @@ -152,6 +176,20 @@ def few_shot_gsm8k(s, question): parser.add_argument("--num-shots", type=int, default=5) parser.add_argument("--data-path", type=str, default="test.jsonl") parser.add_argument("--num-questions", type=int, default=200) + parser.add_argument("--max-new-tokens", type=int, default=512) + parser.add_argument("--temperature", type=float, default=0.0) + parser.add_argument("--top-p", type=float, default=1.0) + parser.add_argument( + "--enable-thinking", + action="store_true", + help="Enable thinking mode by wrapping prompts with chat template", + ) + parser.add_argument( + "--tokenizer-path", + type=str, + default=None, + help="Path to tokenizer (required when --enable-thinking is set)", + ) parser.add_argument( "--platinum", action="store_true", diff --git a/benchmark/hicache/bench_long_context.py b/benchmark/hicache/bench_long_context.py index a3656cef9ea3..dfcecbbc364c 100644 --- a/benchmark/hicache/bench_long_context.py +++ b/benchmark/hicache/bench_long_context.py @@ -12,7 +12,7 @@ ) from tqdm.asyncio import tqdm -from sglang.bench_serving import get_tokenizer +from sglang.benchmark.utils import get_tokenizer class ContextWorkloadGenerator(WorkloadGenerator): @@ -36,20 +36,18 @@ def __init__(self, args): init_requests = [] for i in range(num_requests): context_id = self.dataset["queries"][i]["context"] - init_requests.append( - ( - i, - gen_payload( - self.dataset["contexts"][context_id] - + self.dataset["queries"][i]["question"], - len( - self.tokenizer( - self.dataset["queries"][i]["reference_answer"] - )["input_ids"] - ), - ), - ) + # Tokenize the context + question to get input_ids + prompt_text = ( + self.dataset["contexts"][context_id] + + self.dataset["queries"][i]["question"] ) + input_ids = self.tokenizer.encode(prompt_text) + output_len = len( + self.tokenizer(self.dataset["queries"][i]["reference_answer"])[ + "input_ids" + ] + ) + init_requests.append((i, 
gen_payload(input_ids, output_len))) self.ready_queue = ReadyQueue(init_requests=init_requests) self.response_queue = queue.Queue() diff --git a/benchmark/hicache/bench_mix.py b/benchmark/hicache/bench_mix.py index cfd25bc4003d..833dbf780add 100644 --- a/benchmark/hicache/bench_mix.py +++ b/benchmark/hicache/bench_mix.py @@ -12,12 +12,9 @@ import aiohttp -from sglang.bench_serving import ( - RequestFuncOutput, - get_tokenizer, - remove_prefix, - sample_random_requests, -) +from sglang.bench_serving import RequestFuncOutput +from sglang.benchmark.datasets.random import sample_random_requests +from sglang.benchmark.utils import get_tokenizer, remove_prefix # Set up logger logger = logging.getLogger(__name__) diff --git a/benchmark/hicache/bench_multiturn.py b/benchmark/hicache/bench_multiturn.py index 95e7c9f5c8d0..d821bbc7b78c 100644 --- a/benchmark/hicache/bench_multiturn.py +++ b/benchmark/hicache/bench_multiturn.py @@ -6,22 +6,21 @@ import threading import time from datetime import datetime -from typing import Optional -import aiohttp import numpy as np import requests from tqdm.asyncio import tqdm -from sglang.bench_serving import ( - RequestFuncOutput, - get_tokenizer, - remove_prefix, - sample_random_requests, +from sglang.bench_serving import RequestFuncOutput +from sglang.benchmark.datasets.random import sample_random_requests +from sglang.benchmark.utils import get_tokenizer +from sglang.test.kits.cache_hit_kit import ( + async_request_openai_chat_completions, + async_request_sglang_generate, + gen_payload, + gen_payload_openai, ) -AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=20 * 60 * 60) - def parse_args(): parser = argparse.ArgumentParser( @@ -133,6 +132,24 @@ def parse_args(): default="", help="Tag of a certain run in the log file", ) + parser.add_argument( + "--min-rounds", + type=int, + default=0, + help="Min rounds per client (0 = use --num-rounds)", + ) + parser.add_argument( + "--max-rounds", + type=int, + default=0, + help="Max rounds per client 
(0 = use --num-rounds)", + ) + parser.add_argument( + "--range-ratio", + type=float, + default=1.0, + help="Length variation ratio for prompts and outputs (1.0 = no variation, 0.5 = 50%% variation)", + ) parser.add_argument("--seed", type=int, default=1, help="The random seed.") parser.add_argument( "--lora-path", @@ -140,98 +157,17 @@ def parse_args(): default="", help="String of LoRA path. Currently we only support benchmarking on a single LoRA adaptor.", ) + parser.add_argument( + "--api-format", + type=str, + default="sglang", + choices=["sglang", "openai"], + help="API format to use: 'sglang' for native /generate endpoint, " + "'openai' for OpenAI-compatible /v1/chat/completions endpoint.", + ) return parser.parse_args() -async def async_request_sglang_generate( - payload, - url, - pbar: Optional[tqdm] = None, -): - """ - Sends a streaming request to the server. Gathers text token-by-token. - """ - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - headers = {} - generated_text = "" - ttft = 0.0 - st = time.perf_counter() - most_recent_timestamp = st - output = RequestFuncOutput() - - try: - async with session.post(url=url, json=payload, headers=headers) as response: - if response.status == 200: - prompt_tokens = 0 - cached_tokens = 0 - async for chunk_bytes in response.content: - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - - chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") - latency = time.perf_counter() - st - if chunk == "[DONE]": - pass - else: - data = json.loads(chunk) - - if data["text"]: - timestamp = time.perf_counter() - # First token - if ttft == 0.0: - ttft = time.perf_counter() - st - output.ttft = ttft - prompt_tokens = (data.get("meta_info") or {}).get( - "prompt_tokens", 0 - ) - cached_tokens = (data.get("meta_info") or {}).get( - "cached_tokens", 0 - ) - - # Decoding phase - else: - output.itl.append(timestamp - most_recent_timestamp) - - most_recent_timestamp = timestamp - 
generated_text = data["text"] - - output.generated_text = generated_text - output.success = True - output.latency = latency - output.prompt_len = prompt_tokens - output.cached_tokens = cached_tokens - output.generated_len = len(output.itl) + 1 - else: - output.error = response.reason or "" - output.success = False - except Exception as e: - output.success = False - output.error = str(e) - print(f"Request failed: {e}") - - if pbar: - pbar.update(1) - return output - - -def gen_payload(prompt, output_len, lora_path=""): - payload = { - "text": prompt, - "sampling_params": { - "temperature": 0.0, - "max_new_tokens": output_len, - "ignore_eos": True, - }, - "stream": True, - "stream_options": {"include_usage": True}, - "lora_path": lora_path, - "return_logprob": False, - "logprob_start_len": -1, - } - return payload - - def log_to_jsonl_file(data, file_path="performance_metrics.jsonl", tag=""): """Append the data with a timestamp and tag to the specified JSONL file.""" timestamped_data = {"timestamp": datetime.now().isoformat(), "tag": tag, **data} @@ -274,66 +210,159 @@ def pop(self): class WorkloadGenerator: def __init__(self, args): - # Construct the base URL for requests - self.url = f"http://{args.host}:{args.port}/generate" + self.api_format = args.api_format + self.model_path = args.model_path + + # Construct the base URL and select request/payload functions + if self.api_format == "openai": + self.url = f"http://{args.host}:{args.port}/v1/chat/completions" + self.request_func = async_request_openai_chat_completions + else: + self.url = f"http://{args.host}:{args.port}/generate" + self.request_func = async_request_sglang_generate self.tokenizer = get_tokenizer(args.model_path) self.distribution = args.distribution self.request_rate = args.request_rate self.start_time = None self.finished_time = None + self.lora_path = args.lora_path self.sent_requests = 0 self.completed_requests = 0 - self.candidate_inputs = sample_random_requests( + # Resolve per-client round 
counts + min_rounds = args.min_rounds + max_rounds = args.max_rounds + if min_rounds == 0 and max_rounds == 0: + # Backward compat: all clients use --num-rounds + min_rounds = args.num_rounds + max_rounds = args.num_rounds + elif min_rounds == 0: + min_rounds = max_rounds + elif max_rounds == 0: + max_rounds = min_rounds + if min_rounds < 1: + raise ValueError(f"--min-rounds must be >= 1, got {min_rounds}") + if min_rounds > max_rounds: + raise ValueError( + f"--min-rounds ({min_rounds}) must be <= --max-rounds ({max_rounds})" + ) + + self.min_rounds = min_rounds + self.max_rounds = max_rounds + + if min_rounds == max_rounds: + # All clients have the same round count; skip randint to preserve random state + self.client_total_rounds = [min_rounds] * args.num_clients + else: + self.client_total_rounds = [ + random.randint(min_rounds, max_rounds) for _ in range(args.num_clients) + ] + + # clients_per_round[r] = number of clients participating in round r + self.clients_per_round = [ + sum(1 for t in self.client_total_rounds if t > r) for r in range(max_rounds) + ] + self.total_requests = sum(self.client_total_rounds) + + range_ratio = args.range_ratio + + # Use return_text=False to get token ids instead of text + first_round_samples = sample_random_requests( input_len=args.request_length, output_len=args.output_length, num_prompts=args.num_clients, - range_ratio=1.0, + range_ratio=range_ratio, tokenizer=self.tokenizer, dataset_path=args.dataset_path, random_sample=not args.disable_random_sample, + return_text=False, ) - self.candidate_inputs = [i.prompt for i in self.candidate_inputs] + # Store per-sample output_len for first round + first_round_output_lens = [row.output_len for row in first_round_samples] + # r.prompt is now List[int] when return_text=False + self.candidate_inputs = [list(i.prompt) for i in first_round_samples] if args.sub_question_input_length != 0: sub_question_input_length = args.sub_question_input_length else: sub_question_input_length = 
args.request_length + num_sub_questions = sum(max(t - 1, 0) for t in self.client_total_rounds) + self.sub_question_inputs = sample_random_requests( input_len=sub_question_input_length, output_len=args.output_length, - num_prompts=args.num_clients * max(args.num_rounds - 1, 1), - range_ratio=1.0, + num_prompts=max(num_sub_questions, 1), + range_ratio=range_ratio, tokenizer=self.tokenizer, dataset_path=args.dataset_path, random_sample=not args.disable_random_sample, + return_text=False, ) - init_requests = [ - ( - i, - gen_payload( - self.candidate_inputs[i], args.output_length, args.lora_path - ), - ) - for i in range(args.num_clients) - ] - self.client_records = { - i: {"round": 0, "history": init_requests[i][1]["text"]} - for i in range(args.num_clients) - } + if self.api_format == "openai": + # OpenAI mode: history is a messages list for /v1/chat/completions + initial_messages = { + i: [ + { + "role": "user", + "content": self.tokenizer.decode(self.candidate_inputs[i]), + } + ] + for i in range(args.num_clients) + } + init_requests = [ + ( + i, + gen_payload_openai( + initial_messages[i], + first_round_output_lens[i], + self.model_path, + ), + ) + for i in range(args.num_clients) + ] + self.client_records = { + i: { + "round": 0, + "history": initial_messages[i], + "total_rounds": self.client_total_rounds[i], + } + for i in range(args.num_clients) + } + else: + # SGLang mode: history is List[int] (token ids) + init_requests = [ + ( + i, + gen_payload( + self.candidate_inputs[i], + first_round_output_lens[i], + args.lora_path, + ), + ) + for i in range(args.num_clients) + ] + self.client_records = { + i: { + "round": 0, + "history": list(self.candidate_inputs[i]), + "total_rounds": self.client_total_rounds[i], + } + for i in range(args.num_clients) + } self.ready_queue = ReadyQueue( init_requests=init_requests, policy=args.ready_queue_policy ) self.candidate_inputs = self.candidate_inputs[args.num_clients :] self.response_queue = queue.Queue() - self.pbar = 
tqdm(total=args.num_clients * args.num_rounds) + self.pbar = tqdm(total=self.total_requests) self.performance_metrics = { "ttft": [], + "itl": [], "latency": [], "prompt_len": [], "cached_tokens": [], @@ -342,7 +371,7 @@ def __init__(self, args): self.enable_round_barrier = args.enable_round_barrier if self.enable_round_barrier: # Add round-specific metrics while preserving the original structure - for i in range(args.num_rounds): + for i in range(self.max_rounds): self.performance_metrics[f"round_{i}"] = { "ttft": [], "latency": [], @@ -352,19 +381,23 @@ def __init__(self, args): } self.num_clients = args.num_clients - self.num_rounds = args.num_rounds + self.num_rounds = self.max_rounds self.max_parallel = args.max_parallel self.output_length = args.output_length async def handle_request(self, item): + client_id, payload = item try: - client_id, payload = item - response = await async_request_sglang_generate(payload, self.url, self.pbar) + response = await self.request_func(payload, self.url, self.pbar) if self.pbar.n == self.pbar.total: self.finished_time = time.perf_counter() self.response_queue.put((client_id, response)) except Exception as e: - print(f"Request failed: {e}") + print(f"Request failed for client {client_id}: {e}") + failed_response = RequestFuncOutput() + failed_response.success = False + failed_response.error = str(e) + self.response_queue.put((client_id, failed_response)) def request_sender(self): async def request_loop(): @@ -401,17 +434,31 @@ async def request_loop(): def response_handler(self): next_round_reqs = [] + current_barrier_round = 0 + barrier_round_completed = 0 while True: try: client_id, response = self.response_queue.get( timeout=10 ) # Block until response is available if not response.success: - raise ValueError(f"Request failed with error: {response.error}") - self.client_records[client_id]["history"] += response.generated_text + print(f"Request failed for client {client_id}: {response.error}") + self.completed_requests += 1 
+ continue + # Extend history with response + if self.api_format == "openai": + if response.generated_text: + self.client_records[client_id]["history"].append( + {"role": "assistant", "content": response.generated_text} + ) + else: + self.client_records[client_id]["history"].extend( + response.output_ids + ) current_round = self.client_records[client_id]["round"] self.client_records[client_id]["round"] += 1 self.performance_metrics["ttft"].append(response.ttft) + self.performance_metrics["itl"].extend(response.itl) self.performance_metrics["latency"].append(response.latency) self.performance_metrics["prompt_len"].append(response.prompt_len) self.performance_metrics["cached_tokens"].append(response.cached_tokens) @@ -434,27 +481,59 @@ def response_handler(self): ].append(response.generated_len) self.completed_requests += 1 - if self.client_records[client_id]["round"] < self.num_rounds: - # append new request to client's history - self.client_records[client_id][ - "history" - ] += self.sub_question_inputs.pop().prompt - new_req = ( - client_id, - gen_payload( - self.client_records[client_id]["history"], - self.output_length, - args.lora_path, - ), - ) + client_total = self.client_records[client_id]["total_rounds"] + if self.client_records[client_id]["round"] < client_total: + sub_q = self.sub_question_inputs.pop() + if self.api_format == "openai": + # Append sub-question as a new user message + sub_q_text = self.tokenizer.decode(list(sub_q.prompt)) + self.client_records[client_id]["history"].append( + {"role": "user", "content": sub_q_text} + ) + new_req = ( + client_id, + gen_payload_openai( + self.client_records[client_id]["history"], + sub_q.output_len, + self.model_path, + ), + ) + else: + # Append sub-question token ids to client's history + sub_q_ids = list(sub_q.prompt) + self.client_records[client_id]["history"].extend(sub_q_ids) + new_req = ( + client_id, + gen_payload( + self.client_records[client_id]["history"], + sub_q.output_len, + self.lora_path, + ), + 
) if self.enable_round_barrier: next_round_reqs.append(new_req) - if len(next_round_reqs) == self.num_clients: - for req in next_round_reqs: - self.ready_queue.append(req) - next_round_reqs = [] else: self.ready_queue.append(new_req) + + # Barrier logic: release next round when all clients for + # current barrier round have completed + if ( + self.enable_round_barrier + and current_barrier_round < self.max_rounds + ): + barrier_round_completed += 1 + expected = self.clients_per_round[current_barrier_round] + if barrier_round_completed == expected: + print( + f"\n Barrier: round {current_barrier_round} complete " + f"({expected} clients), releasing {len(next_round_reqs)} " + f"requests for round {current_barrier_round + 1}" + ) + for req in next_round_reqs: + self.ready_queue.append(req) + next_round_reqs = [] + current_barrier_round += 1 + barrier_round_completed = 0 except queue.Empty: if self.pbar.n == self.pbar.total: break @@ -477,6 +556,9 @@ def run(self): duration = self.finished_time - self.start_time sorted_ttft = sorted(self.performance_metrics["ttft"]) sorted_latency = sorted(self.performance_metrics["latency"]) + sorted_itl = sorted(self.performance_metrics["itl"]) + sorted_prompt_len = sorted(self.performance_metrics["prompt_len"]) + sorted_output_len = sorted(self.performance_metrics["generated_len"]) def percentile(sorted_vals, q): if not sorted_vals: @@ -505,12 +587,26 @@ def max_or_zero(sorted_vals): if self.performance_metrics["generated_len"] else 0.0 ), + "p90_prompt_len": percentile(sorted_prompt_len, 0.9), + "p99_prompt_len": percentile(sorted_prompt_len, 0.99), + "p90_output_len": percentile(sorted_output_len, 0.9), + "p99_output_len": percentile(sorted_output_len, 0.99), "average_ttft": sum(self.performance_metrics["ttft"]) / len(self.performance_metrics["ttft"]), "p90_ttft": percentile(sorted_ttft, 0.9), "p99_ttft": percentile(sorted_ttft, 0.99), "median_ttft": percentile(sorted_ttft, 0.5), "max_ttft": max_or_zero(sorted_ttft), + 
"average_itl": ( + sum(self.performance_metrics["itl"]) + / len(self.performance_metrics["itl"]) + if self.performance_metrics["itl"] + else 0.0 + ), + "p90_itl": percentile(sorted_itl, 0.9), + "p99_itl": percentile(sorted_itl, 0.99), + "median_itl": percentile(sorted_itl, 0.5), + "max_itl": max_or_zero(sorted_itl), "average_latency": sum(self.performance_metrics["latency"]) / len(self.performance_metrics["latency"]), "p90_latency": percentile(sorted_latency, 0.9), @@ -534,7 +630,7 @@ def max_or_zero(sorted_vals): } if self.enable_round_barrier: performance_data["round"] = {} - for round_num in range(args.num_rounds): + for round_num in range(self.num_rounds): round_key = f"round_{round_num}" round_metrics = self.performance_metrics[round_key] performance_data["round"][round_key] = { @@ -562,11 +658,28 @@ def max_or_zero(sorted_vals): print( f" Average Output Length: {performance_data['summary']['average_output_len']:.2f} tokens" ) + print( + f" P90 Prompt Length: {performance_data['summary']['p90_prompt_len']:.0f} tokens" + ) + print( + f" P99 Prompt Length: {performance_data['summary']['p99_prompt_len']:.0f} tokens" + ) + print( + f" P90 Output Length: {performance_data['summary']['p90_output_len']:.0f} tokens" + ) + print( + f" P99 Output Length: {performance_data['summary']['p99_output_len']:.0f} tokens" + ) print(f" Average TTFT: {performance_data['summary']['average_ttft']:.2f}") print(f" P90 TTFT: {performance_data['summary']['p90_ttft']:.2f}") print(f" P99 TTFT: {performance_data['summary']['p99_ttft']:.2f}") print(f" Median TTFT: {performance_data['summary']['median_ttft']:.2f}") print(f" Max TTFT: {performance_data['summary']['max_ttft']:.2f}") + print(f" Average ITL: {performance_data['summary']['average_itl']:.4f}") + print(f" P90 ITL: {performance_data['summary']['p90_itl']:.4f}") + print(f" P99 ITL: {performance_data['summary']['p99_itl']:.4f}") + print(f" Median ITL: {performance_data['summary']['median_itl']:.4f}") + print(f" Max ITL: 
{performance_data['summary']['max_itl']:.4f}") print( f" Average latency: {performance_data['summary']['average_latency']:.2f}" ) @@ -596,10 +709,12 @@ def max_or_zero(sorted_vals): avg_ttft = round_data["average_ttft"] cache_hit_rate = round_data["cache_hit_rate"] request_count = round_data["request_count"] + clients_in_round = self.clients_per_round[round_num] print( f" Round {round_num}: Average TTFT = {avg_ttft:.2f}s, " f"Cache Hit Rate = {cache_hit_rate:.6f} " - f"({request_count} requests)" + f"({request_count} requests, " + f"{clients_in_round} clients)" ) else: print(f" Round {round_num}: No requests completed") diff --git a/benchmark/hicache/bench_serving.py b/benchmark/hicache/bench_serving.py index e38d0d0eaf21..2355e7721c14 100644 --- a/benchmark/hicache/bench_serving.py +++ b/benchmark/hicache/bench_serving.py @@ -32,7 +32,7 @@ from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase -from sglang.bench_serving import get_tokenizer, remove_prefix, set_ulimit +from sglang.benchmark.utils import get_tokenizer, remove_prefix, set_ulimit AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=20 * 60 * 60) diff --git a/benchmark/hicache/data_processing.py b/benchmark/hicache/data_processing.py index dd0cbf669dc0..8c4b8cd1bfb5 100644 --- a/benchmark/hicache/data_processing.py +++ b/benchmark/hicache/data_processing.py @@ -11,13 +11,13 @@ from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase -from sglang.bench_serving import ( +from sglang.benchmark.datasets.common import ( SHAREGPT_FILENAME, SHAREGPT_REPO_ID, - download_and_cache_hf_file, gen_prompt, - get_gen_prefix_cache_path, ) +from sglang.benchmark.datasets.generated_shared_prefix import get_gen_prefix_cache_path +from sglang.benchmark.utils import download_and_cache_hf_file from sglang.lang.chat_template import get_chat_template, get_chat_template_by_model_path from sglang.srt.entrypoints.openai.protocol import ChatCompletionMessageContentPart from sglang.utils 
import encode_video_base64 @@ -442,7 +442,15 @@ def sample_generated_shared_prefix_requests( disable_shuffle: bool = False, ) -> SampleOutput: """Generate benchmark requests with shared system prompts using random tokens and caching.""" - cache_path = get_gen_prefix_cache_path(args, tokenizer) + cache_path = get_gen_prefix_cache_path( + args.seed, + num_groups, + prompts_per_group, + system_prompt_len, + question_len, + output_len, + tokenizer, + ) # Try to load from cache first if cache_path.exists(): diff --git a/benchmark/kernels/all_reduce/benchmark_fused_ar_rms_amd.py b/benchmark/kernels/all_reduce/benchmark_fused_ar_rms_amd.py new file mode 100644 index 000000000000..f45a230eebbb --- /dev/null +++ b/benchmark/kernels/all_reduce/benchmark_fused_ar_rms_amd.py @@ -0,0 +1,536 @@ +""" +Benchmark fused allreduce+rmsnorm on AMD with correctness checks. + +This script targets the same fused op used by SGLang: +`tensor_model_parallel_fused_allreduce_rmsnorm`. + +It reports: +- eager mode latency (prefill-like) +- graph mode latency (decode-like) +- fused availability (whether fused path returns non-None) +- correctness (fused output matches split allreduce + rmsnorm reference) + +Usage example: + torchrun --nproc_per_node=8 \ + benchmark/kernels/all_reduce/benchmark_fused_ar_rms_amd.py \ + --dtype bfloat16 \ + --prefill-shapes 2048x8192,8192x8192 \ + --decode-shapes 1x8192,4x8192,16x8192 \ + --warmup 10 --iters 30 --repeats 5 +""" + +import argparse +import csv +import os +import statistics +from typing import Dict, List, Optional, Sequence, Tuple + +import torch +import torch.distributed as dist +import torch.nn.functional as F + +from sglang.srt.distributed.communication_op import ( + tensor_model_parallel_all_reduce, + tensor_model_parallel_fused_allreduce_rmsnorm, +) +from sglang.srt.distributed.parallel_state import ( + destroy_distributed_environment, + destroy_model_parallel, + graph_capture, + init_distributed_environment, + initialize_model_parallel, + 
set_custom_all_reduce, +) + +Shape = Tuple[int, int] + + +def parse_shapes(raw: str) -> List[Shape]: + shapes: List[Shape] = [] + for item in [x.strip() for x in raw.split(",") if x.strip()]: + if "x" not in item: + raise ValueError(f"Invalid shape '{item}', expected MxN format.") + m_str, n_str = item.split("x", 1) + m = int(m_str) + n = int(n_str) + if m <= 0 or n <= 0: + raise ValueError(f"Invalid shape '{item}', both dims must be positive.") + shapes.append((m, n)) + if not shapes: + raise ValueError("Empty shape list is not allowed.") + return shapes + + +def dtype_from_name(name: str) -> torch.dtype: + mapping = { + "float16": torch.float16, + "fp16": torch.float16, + "bfloat16": torch.bfloat16, + "bf16": torch.bfloat16, + } + if name not in mapping: + raise ValueError(f"Unsupported dtype: {name}") + return mapping[name] + + +def check_close( + a: torch.Tensor, b: torch.Tensor, dtype: torch.dtype +) -> Tuple[bool, str]: + if dtype == torch.bfloat16: + rtol, atol = 2e-2, 1.25e-1 + else: + rtol, atol = 1e-2, 2e-2 + try: + torch.testing.assert_close(a, b, rtol=rtol, atol=atol) + return True, "PASS" + except AssertionError: + max_diff = torch.max(torch.abs(a - b)).item() + mean_diff = torch.mean(torch.abs(a - b)).item() + return False, f"FAIL(max={max_diff:.6f},mean={mean_diff:.6f})" + + +def _measure_us( + fn, + warmup: int, + iters: int, + repeats: int, + device: torch.device, +) -> Tuple[float, Dict[str, float]]: + for _ in range(warmup): + fn() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + samples_us: List[float] = [] + + for _ in range(max(1, repeats)): + _barrier(device) + torch.cuda.synchronize() + start_event.record() + for _ in range(iters): + fn() + end_event.record() + end_event.synchronize() + samples_us.append(start_event.elapsed_time(end_event) * 1000.0 / iters) + + sorted_samples = sorted(samples_us) + p50 = float(statistics.median(sorted_samples)) + p95 = 
float(sorted_samples[int((len(sorted_samples) - 1) * 0.95)]) + return p50, { + "p50_us": p50, + "p95_us": p95, + "min_us": float(sorted_samples[0]), + "max_us": float(sorted_samples[-1]), + } + + +def _barrier(device: torch.device): + try: + dist.barrier(device_ids=[device.index]) + except TypeError: + dist.barrier() + + +def _mean_across_ranks(value: float, device: torch.device) -> float: + t = torch.tensor([value], dtype=torch.float64, device=device) + dist.all_reduce(t, op=dist.ReduceOp.SUM) + t /= dist.get_world_size() + return float(t.item()) + + +def _all_true_across_ranks(value: bool, device: torch.device) -> bool: + t = torch.tensor([1 if value else 0], dtype=torch.int32, device=device) + dist.all_reduce(t, op=dist.ReduceOp.MIN) + return bool(int(t.item())) + + +def _make_inputs( + shape: Shape, + dtype: torch.dtype, + seed: int, + residual_mode: str, + rank: int, + device: torch.device, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + m, n = shape + torch.manual_seed(seed + rank * 17) + x = torch.randn((m, n), dtype=torch.float32, device=device).to(dtype) + if residual_mode == "self": + residual = x.clone() + elif residual_mode == "random": + residual = torch.randn((m, n), dtype=torch.float32, device=device).to(dtype) + elif residual_mode == "zero": + residual = torch.zeros((m, n), dtype=dtype, device=device) + else: + raise ValueError(f"Unknown residual_mode: {residual_mode}") + weight = torch.randn((n,), dtype=torch.float32, device=device).to(dtype) + return x, residual, weight + + +def _split_reference( + x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float +) -> Tuple[torch.Tensor, torch.Tensor]: + ar_out = tensor_model_parallel_all_reduce(x.clone()) + residual_out = ar_out + residual + out = F.rms_norm( + input=residual_out, + normalized_shape=(residual_out.shape[-1],), + weight=weight, + eps=eps, + ) + return out, residual_out + + +def bench_eager( + x: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + 
eps: float, + warmup: int, + iters: int, + repeats: int, +) -> Dict[str, object]: + split_fn = lambda: _split_reference(x, residual, weight, eps) + split_us, split_stats = _measure_us(split_fn, warmup, iters, repeats, x.device) + + fused_probe = tensor_model_parallel_fused_allreduce_rmsnorm( + x.clone(), residual.clone(), weight, eps + ) + fused_available = fused_probe is not None + + fused_us: Optional[float] = None + fused_stats: Optional[Dict[str, float]] = None + if fused_available: + fused_fn = lambda: tensor_model_parallel_fused_allreduce_rmsnorm( + x, residual, weight, eps + ) + fused_us, fused_stats = _measure_us(fused_fn, warmup, iters, repeats, x.device) + + ref_out, ref_residual = _split_reference(x, residual, weight, eps) + if fused_available: + fused_out, fused_residual = tensor_model_parallel_fused_allreduce_rmsnorm( + x.clone(), residual.clone(), weight, eps + ) + out_ok, out_detail = check_close(fused_out, ref_out, x.dtype) + res_ok, res_detail = check_close(fused_residual, ref_residual, x.dtype) + correctness_ok = out_ok and res_ok + correctness_detail = f"out={out_detail}, residual={res_detail}" + else: + correctness_ok = True + correctness_detail = "SKIP(fused_unavailable)" + + return { + "split_us": split_us, + "split_stats": split_stats, + "fused_available": fused_available, + "fused_us": fused_us, + "fused_stats": fused_stats, + "correctness_ok": correctness_ok, + "correctness_detail": correctness_detail, + } + + +def bench_graph( + x: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + eps: float, + warmup: int, + iters: int, + repeats: int, +) -> Dict[str, object]: + split_x = x.clone() + split_res = residual.clone() + split_graph_out: Optional[torch.Tensor] = None + + with graph_capture() as gc: + split_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(split_graph, stream=gc.stream): + split_graph_out, _ = _split_reference(split_x, split_res, weight, eps) + + def split_replay(): + split_graph.replay() + + split_us, 
split_stats = _measure_us(split_replay, warmup, iters, repeats, x.device) + + fused_probe = tensor_model_parallel_fused_allreduce_rmsnorm( + x.clone(), residual.clone(), weight, eps + ) + fused_available = fused_probe is not None + + fused_us: Optional[float] = None + fused_stats: Optional[Dict[str, float]] = None + fused_graph_out: Optional[torch.Tensor] = None + fused_graph_residual: Optional[torch.Tensor] = None + + if fused_available: + fused_x = x.clone() + fused_res = residual.clone() + with graph_capture() as gc: + fused_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(fused_graph, stream=gc.stream): + fused_graph_out, fused_graph_residual = ( + tensor_model_parallel_fused_allreduce_rmsnorm( + fused_x, fused_res, weight, eps + ) + ) + + def fused_replay(): + fused_graph.replay() + + fused_us, fused_stats = _measure_us( + fused_replay, warmup, iters, repeats, x.device + ) + + ref_out, ref_residual = _split_reference(x, residual, weight, eps) + if ( + fused_available + and fused_graph_out is not None + and fused_graph_residual is not None + ): + fused_graph.replay() + torch.cuda.synchronize() + out_ok, out_detail = check_close(fused_graph_out, ref_out, x.dtype) + res_ok, res_detail = check_close(fused_graph_residual, ref_residual, x.dtype) + correctness_ok = out_ok and res_ok + correctness_detail = f"out={out_detail}, residual={res_detail}" + else: + correctness_ok = True + correctness_detail = "SKIP(fused_unavailable)" + + return { + "split_us": split_us, + "split_stats": split_stats, + "fused_available": fused_available, + "fused_us": fused_us, + "fused_stats": fused_stats, + "correctness_ok": correctness_ok, + "correctness_detail": correctness_detail, + } + + +def _shape_bytes(shape: Shape, dtype: torch.dtype) -> int: + m, n = shape + return m * n * torch.tensor([], dtype=dtype).element_size() + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Benchmark fused allreduce+rmsnorm (prefill eager + decode graph)." 
+ ) + parser.add_argument( + "--dtype", + type=str, + default="bf16", + choices=["fp16", "bf16", "float16", "bfloat16"], + ) + parser.add_argument("--eps", type=float, default=1e-6) + parser.add_argument("--seed", type=int, default=1234) + parser.add_argument( + "--residual-mode", + type=str, + default="self", + choices=["self", "random", "zero"], + help="Use residual=x (self) to match aiter test behavior by default.", + ) + parser.add_argument( + "--prefill-shapes", + type=str, + default="2048x8192,8192x8192,16384x8192", + help="Comma-separated MxN shapes for eager mode.", + ) + parser.add_argument( + "--decode-shapes", + type=str, + default="1x8192,2x8192,4x8192,8x8192,16x8192", + help="Comma-separated MxN shapes for graph mode.", + ) + parser.add_argument("--warmup", type=int, default=10) + parser.add_argument("--iters", type=int, default=30) + parser.add_argument("--repeats", type=int, default=5) + parser.add_argument( + "--mode", + type=str, + default="both", + choices=["eager", "graph", "both"], + ) + parser.add_argument( + "--csv-out", + type=str, + default=None, + help="Optional output CSV path (written on rank 0 only).", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + dtype = dtype_from_name(args.dtype) + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", str(rank))) + torch.cuda.set_device(local_rank % torch.cuda.device_count()) + device = torch.device(f"cuda:{local_rank % torch.cuda.device_count()}") + + set_custom_all_reduce(True) + init_distributed_environment( + world_size=world_size, + rank=rank, + local_rank=local_rank, + distributed_init_method="env://", + backend="nccl", + ) + initialize_model_parallel(tensor_model_parallel_size=world_size) + + prefill_shapes = parse_shapes(args.prefill_shapes) + decode_shapes = parse_shapes(args.decode_shapes) + + if rank == 0: + print( + "Config: " + f"world_size={world_size}, dtype={dtype}, 
residual_mode={args.residual_mode}, " + f"warmup={args.warmup}, iters={args.iters}, repeats={args.repeats}" + ) + + run_modes: Sequence[str] + if args.mode == "both": + run_modes = ("eager", "graph") + else: + run_modes = (args.mode,) + csv_rows: List[Dict[str, object]] = [] + + for mode in run_modes: + shapes = prefill_shapes if mode == "eager" else decode_shapes + if rank == 0: + phase_name = "prefill(eager)" if mode == "eager" else "decode(graph)" + print("\n" + "=" * 120) + print(f"Mode: {phase_name}") + print( + "| Shape | Input bytes/rank | Split p50 (us) | Fused p50 (us) | Speedup | Fused available | Correctness |" + ) + print( + "|:------|-----------------:|---------------:|---------------:|--------:|:----------------|:------------|" + ) + + for shape in shapes: + x, residual, weight = _make_inputs( + shape=shape, + dtype=dtype, + seed=args.seed, + residual_mode=args.residual_mode, + rank=rank, + device=device, + ) + + if mode == "eager": + metrics = bench_eager( + x=x, + residual=residual, + weight=weight, + eps=args.eps, + warmup=args.warmup, + iters=args.iters, + repeats=args.repeats, + ) + else: + metrics = bench_graph( + x=x, + residual=residual, + weight=weight, + eps=args.eps, + warmup=args.warmup, + iters=args.iters, + repeats=args.repeats, + ) + + split_us = _mean_across_ranks(float(metrics["split_us"]), device) + fused_available = _all_true_across_ranks( + bool(metrics["fused_available"]), device + ) + correctness_ok = _all_true_across_ranks( + bool(metrics["correctness_ok"]), device + ) + + fused_us: Optional[float] = None + if fused_available and metrics["fused_us"] is not None: + fused_us = _mean_across_ranks(float(metrics["fused_us"]), device) + + if rank == 0: + m, n = shape + shape_str = f"{m}x{n}" + bytes_per_rank = _shape_bytes(shape, dtype) + if fused_us is not None and fused_us > 0: + speedup = split_us / fused_us + speedup_str = f"{speedup:.3f}x" + fused_str = f"{fused_us:.1f}" + else: + speedup_str = "N/A" + fused_str = "N/A" + 
correctness_text = ( + "PASS" if correctness_ok else str(metrics["correctness_detail"]) + ) + print( + f"| {shape_str} | {bytes_per_rank} | {split_us:.1f} | {fused_str} | " + f"{speedup_str} | {str(fused_available)} | {correctness_text} |" + ) + csv_rows.append( + { + "mode": mode, + "shape": shape_str, + "m": m, + "n": n, + "bytes_per_rank": bytes_per_rank, + "split_p50_us": split_us, + "fused_p50_us": fused_us if fused_us is not None else "", + "speedup_split_over_fused": ( + split_us / fused_us + if fused_us is not None and fused_us > 0 + else "" + ), + "fused_available": fused_available, + "correctness_ok": correctness_ok, + "correctness_detail": correctness_text, + "dtype": str(dtype), + "world_size": world_size, + "residual_mode": args.residual_mode, + "warmup": args.warmup, + "iters": args.iters, + "repeats": args.repeats, + } + ) + + if rank == 0 and args.csv_out: + os.makedirs(os.path.dirname(args.csv_out) or ".", exist_ok=True) + fieldnames = [ + "mode", + "shape", + "m", + "n", + "bytes_per_rank", + "split_p50_us", + "fused_p50_us", + "speedup_split_over_fused", + "fused_available", + "correctness_ok", + "correctness_detail", + "dtype", + "world_size", + "residual_mode", + "warmup", + "iters", + "repeats", + ] + with open(args.csv_out, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(csv_rows) + print(f"\nSaved CSV to: {args.csv_out}") + + _barrier(device) + destroy_model_parallel() + destroy_distributed_environment() + + +if __name__ == "__main__": + main() diff --git a/benchmark/kernels/fused_moe_triton/benchmark_sglang_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/benchmark_sglang_fused_moe_triton.py index b418855a2188..df2952b29068 100644 --- a/benchmark/kernels/fused_moe_triton/benchmark_sglang_fused_moe_triton.py +++ b/benchmark/kernels/fused_moe_triton/benchmark_sglang_fused_moe_triton.py @@ -18,7 +18,13 @@ triton_kernel_moe_forward, ) from 
sglang.srt.layers.moe.moe_runner import MoeRunnerConfig -from sglang.srt.layers.moe.topk import TopK, TopKConfig, select_experts +from sglang.srt.layers.moe.topk import ( + TopK, + TopKConfig, + TopKOutputFormat, + select_experts, +) +from sglang.srt.server_args import ServerArgs, set_global_server_args_for_scheduler def fused_moe_triton_api( @@ -32,8 +38,8 @@ def fused_moe_triton_api( top_k=topk, renormalize=False, use_grouped_topk=False, + output_format=TopKOutputFormat.TRITON_KERNEL, ) - topk_op.use_triton_kernels = True triton_topk_output = topk_op.forward_cuda( hidden_states=x, router_logits=input_gating, @@ -199,6 +205,10 @@ def main(): parser.add_argument("--trust-remote-code", action="store_true") args = parser.parse_args() + # Initialize global server args (required by SGLang MoE kernels) + server_args = ServerArgs(model_path=args.model) + set_global_server_args_for_scheduler(server_args) + try: if not torch.distributed.is_initialized(): torch.distributed.init_process_group( @@ -217,8 +227,8 @@ def main(): ) initialize_model_parallel( - tensor_model_parallel_size=args.ep_size, - pipeline_model_parallel_size=args.tp_size, + tensor_model_parallel_size=1, + expert_model_parallel_size=1, ) model_config = get_model_config(args.model, args.tp_size, args.ep_size) diff --git a/benchmark/kernels/fused_moe_triton/common_utils.py b/benchmark/kernels/fused_moe_triton/common_utils.py index 5f2d9aa8a244..adac313a11b1 100644 --- a/benchmark/kernels/fused_moe_triton/common_utils.py +++ b/benchmark/kernels/fused_moe_triton/common_utils.py @@ -38,6 +38,10 @@ def get_model_config( ) -> Dict: config = get_config(model_name, trust_remote_code=True) + # Replace config with text_config for encoder-decoder models before block_shape and architecture are read + if hasattr(config, "text_config"): + config = config.get_text_config() + block_shape = None if ( hasattr(config, "quantization_config") @@ -46,11 +50,19 @@ def get_model_config( block_shape = 
config.quantization_config["weight_block_size"] assert len(block_shape) == 2 - architecture = config.architectures[0] + if ( + hasattr(config, "quantization_config") + and "config_groups" in config.quantization_config + ): + config_groups = config.quantization_config["config_groups"] + # Get group_size from the first group's weights config + first_group = next(iter(config_groups.values()), {}) + weights_config = first_group.get("weights", {}) + group_size = weights_config.get("group_size") + block_shape = [0, group_size] + assert len(block_shape) == 2 - # Replace config with text_config for encoder-decoder models after getting block_shape and architecture - if hasattr(config, "text_config"): - config = config.get_text_config() + architecture = config.architectures[0] hidden_size = config.hidden_size if architecture == "DbrxForCausalLM": @@ -66,6 +78,7 @@ def get_model_config( "Qwen3MoeForCausalLM", "Qwen3NextForCausalLM", "Qwen3VLMoeForConditionalGeneration", + "Qwen3_5MoeForConditionalGeneration", ]: E = config.num_experts // ep_size topk = config.num_experts_per_tok @@ -222,6 +235,7 @@ def get_config_filename( use_fp8_w8a8: bool, use_int8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool, per_channel_quant: bool, block_shape: List[int], ) -> str: @@ -230,13 +244,18 @@ def get_config_filename( use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a8=use_int8_w8a8, + use_int4_w4a16=use_int4_w4a16, ) # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. 
+ N = shard_intermediate_size // 2 + if use_int4_w4a16: + N = N // 2 + filename = get_config_file_name( num_experts, - shard_intermediate_size // 2, + N, dtype_str, block_shape, per_channel_quant, diff --git a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py index aef7ed8f6ca7..34aa83b38fd2 100644 --- a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py +++ b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py @@ -28,6 +28,10 @@ ) from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig from sglang.srt.layers.moe.topk import TopKConfig, select_experts +from sglang.srt.server_args import ( + ServerArgs, + set_global_server_args_for_scheduler, +) from sglang.srt.utils import is_hip _is_hip = is_hip() @@ -44,6 +48,7 @@ def benchmark_config( use_fp8_w8a8: bool, use_int8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool, per_channel_quant: bool, block_shape: List[int] = None, num_iters: int = 100, @@ -71,6 +76,27 @@ def benchmark_config( ), dtype=torch.int8, ) + elif use_int4_w4a16: + w1 = torch.randint( + 0, + 255, + ( + num_experts, + shard_intermediate_size, + hidden_size // 2, + ), + dtype=torch.uint8, + ) + w2 = torch.randint( + 0, + 255, + ( + num_experts, + hidden_size, + shard_intermediate_size // 4, + ), + dtype=torch.uint8, + ) else: w1 = torch.randn( num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype @@ -89,6 +115,19 @@ def benchmark_config( (num_experts, 2 * shard_intermediate_size), dtype=torch.float32 ) w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32) + if use_int4_w4a16: + block_n = 1 if (block_shape[0] == 0) else block_shape[0] + block_k = block_shape[1] + n_tiles_w1 = (shard_intermediate_size + block_n - 1) // block_n + n_tiles_w2 = (hidden_size + block_n - 1) // block_n + k_tiles_w1 = (hidden_size + block_k - 1) // block_k + k_tiles_w2 = (shard_intermediate_size // 2 + block_k - 1) // block_k + w1_scale = 
torch.randn( + (num_experts, n_tiles_w1, k_tiles_w1), dtype=torch.bfloat16 + ) + w2_scale = torch.randn( + (num_experts, n_tiles_w2, k_tiles_w2), dtype=torch.bfloat16 + ) if use_fp8_w8a8 or use_int8_w8a8: if use_int8_w8a8 and block_shape is None: w1_scale = torch.randn( @@ -146,6 +185,7 @@ def run(): use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a8=use_int8_w8a8, use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, w1_scale=w1_scale, w2_scale=w2_scale, a1_scale=a1_scale, @@ -195,13 +235,14 @@ def run(): @ray.remote(num_gpus=1) class BenchmarkWorker: - def __init__(self, seed: int) -> None: + def __init__(self, seed: int, server_args: ServerArgs) -> None: torch.set_default_device("cuda") torch.cuda.manual_seed_all(0) self.seed = seed # Get the device ID to allocate tensors and kernels # on the respective GPU. self.device_id = int(ray.get_gpu_ids()[0]) + set_global_server_args_for_scheduler(server_args) def benchmark( self, @@ -214,20 +255,27 @@ def benchmark( use_fp8_w8a8: bool, use_int8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool, per_channel_quant: bool, block_shape: List[int], ) -> Tuple[Dict[str, int], float]: torch.cuda.manual_seed_all(0) dtype_str = get_config_dtype_str( - dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 + dtype, + use_int8_w8a16=use_int8_w8a16, + use_fp8_w8a8=use_fp8_w8a8, + use_int4_w4a16=use_int4_w4a16, ) # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. 
block_n = block_shape[0] if block_shape else 0 block_k = block_shape[1] if block_shape else 0 + N = shard_intermediate_size // 2 + if use_int4_w4a16: + N = N // 2 op_config = get_moe_configs( num_experts, - shard_intermediate_size // 2, + N, dtype_str, block_n, block_k, @@ -258,6 +306,7 @@ def benchmark( use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, per_channel_quant, block_shape, ) @@ -274,6 +323,7 @@ def tune( use_fp8_w8a8: bool, use_int8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool, per_channel_quant: bool, block_shape: List[int], search_space: List[Dict[str, int]], @@ -294,6 +344,7 @@ def tune( use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, per_channel_quant, block_shape, num_iters=10, @@ -312,7 +363,9 @@ def tune( def main(args: argparse.Namespace): - print(args) + server_args = ServerArgs( + model_path=args.model, tp_size=args.tp_size, ep_size=args.ep_size + ) model_config = get_model_config( args.model, args.tp_size, args.ep_size, args.disable_shared_experts_fusion @@ -328,6 +381,7 @@ def main(args: argparse.Namespace): use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a8 = args.dtype == "int8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" + use_int4_w4a16 = args.dtype == "int4_w4a16" per_channel_quant = args.per_channel_quant if args.batch_size is None: @@ -337,7 +391,7 @@ def main(args: argparse.Namespace): ray.init() num_gpus = int(ray.available_resources()["GPU"]) - workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] + workers = [BenchmarkWorker.remote(args.seed, server_args) for _ in range(num_gpus)] def _distribute(method: str, inputs: List[Any]) -> List[Any]: outputs = [] @@ -369,6 +423,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, per_channel_quant, block_shape, ) @@ -390,6 +445,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + 
use_int4_w4a16, per_channel_quant, block_shape, search_space, @@ -420,6 +476,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, per_channel_quant, block_shape, ) @@ -442,7 +499,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: parser.add_argument( "--dtype", type=str, - choices=["auto", "fp8_w8a8", "int8_w8a16", "int8_w8a8"], + choices=["auto", "fp8_w8a8", "int8_w8a16", "int8_w8a8", "int4_w4a16"], default="auto", ) parser.add_argument( diff --git a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton_sep.py b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton_sep.py index a903a15a9ec0..8d4afbe84d52 100644 --- a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton_sep.py +++ b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton_sep.py @@ -32,6 +32,10 @@ ) from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig from sglang.srt.layers.moe.topk import TopKConfig, select_experts +from sglang.srt.server_args import ( + ServerArgs, + set_global_server_args_for_scheduler, +) from sglang.srt.utils import is_hip _is_hip = is_hip() @@ -132,6 +136,7 @@ def benchmark_config( use_fp8_w8a8: bool, use_int8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool, topk_ids_list, block_shape: List[int] = None, ep_size: int = 1, @@ -163,6 +168,27 @@ def benchmark_config( ), dtype=torch.int8, ) + elif use_int4_w4a16: + w1 = torch.randint( + 0, + 255, + ( + num_experts, + shard_intermediate_size, + hidden_size // 2, + ), + dtype=torch.uint8, + ) + w2 = torch.randint( + 0, + 255, + ( + num_experts, + hidden_size, + shard_intermediate_size // 4, + ), + dtype=torch.uint8, + ) else: w1 = torch.randn( num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype @@ -180,6 +206,19 @@ def benchmark_config( (num_experts, 2 * shard_intermediate_size), dtype=torch.float32 ) w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32) + if 
use_int4_w4a16: + block_n = 1 if (block_shape[0] == 0) else block_shape[0] + block_k = block_shape[1] + n_tiles_w1 = (shard_intermediate_size + block_n - 1) // block_n + n_tiles_w2 = (hidden_size + block_n - 1) // block_n + k_tiles_w1 = (hidden_size + block_k - 1) // block_k + k_tiles_w2 = (shard_intermediate_size // 2 + block_k - 1) // block_k + w1_scale = torch.randn( + (num_experts, n_tiles_w1, k_tiles_w1), dtype=torch.bfloat16 + ) + w2_scale = torch.randn( + (num_experts, n_tiles_w2, k_tiles_w2), dtype=torch.bfloat16 + ) if use_fp8_w8a8 or use_int8_w8a8: if use_int8_w8a8 and block_shape is None: w1_scale = torch.randn( @@ -284,7 +323,7 @@ def get_kernel_wrapper(moe_use_tma, inner_iter, use_cuda_graph): B=w1, bias=None, C=intermediate_cache1, - A_scale=None, + A_scale=a1_scale, B_scale=w1_scale, B_zp=None, topk_weights=topk_output_.topk_weights, @@ -294,9 +333,9 @@ def get_kernel_wrapper(moe_use_tma, inner_iter, use_cuda_graph): config=config, compute_type=compute_type, use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=False, - use_int8_w8a16=False, - use_int4_w4a16=False, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, per_channel_quant=False, block_shape=block_shape, b_use_tma=moe_use_tma, @@ -320,9 +359,9 @@ def get_kernel_wrapper(moe_use_tma, inner_iter, use_cuda_graph): config=config, compute_type=compute_type, use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=False, - use_int8_w8a16=False, - use_int4_w4a16=False, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, per_channel_quant=False, block_shape=block_shape, a_use_tma=moe_use_tma, @@ -405,13 +444,14 @@ def config_dict(self, block_m): class BenchmarkWorker: - def __init__(self, seed: int) -> None: + def __init__(self, seed: int, server_args: ServerArgs) -> None: torch.set_default_device("cuda") torch.cuda.manual_seed_all(0) self.seed = seed # Get the device ID to allocate tensors and kernels # on the respective GPU. 
self.device_id = 0 # int(ray.get_gpu_ids()[0]) + set_global_server_args_for_scheduler(server_args) def benchmark( self, @@ -424,6 +464,7 @@ def benchmark( use_fp8_w8a8: bool, use_int8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool, block_shape: List[int], cfg: Dict[str, int], topk_ids_dir: str, @@ -443,6 +484,7 @@ def benchmark( use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, topk_ids_list, block_shape, ep_size=ep_size, @@ -460,6 +502,7 @@ def tune( use_fp8_w8a8: bool, use_int8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool, block_shape: List[int], search_space: List[Dict[str, int]], topk_ids_dir: str, @@ -483,6 +526,7 @@ def tune( use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, topk_ids_list, block_shape, ep_size=ep_size, @@ -527,6 +571,7 @@ def cmp_configs( use_fp8_w8a8: bool, use_int8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool, block_shape: List[int], cmp_config_files: List[str], topk_ids_dir: str, @@ -562,6 +607,7 @@ def cmp_configs( use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, topk_ids_list, block_shape, ep_size=ep_size, @@ -582,6 +628,7 @@ def save_configs_sep( use_fp8_w8a8: bool, use_int8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool, block_shape: List[int], down_moe: bool = False, ) -> None: @@ -590,6 +637,7 @@ def save_configs_sep( use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a8=use_int8_w8a8, + use_int4_w4a16=use_int4_w4a16, ) # NOTE(woosuk): The current naming convention uses w2.shape[2], which @@ -611,6 +659,10 @@ def save_configs_sep( def main(args: argparse.Namespace): print(args) + server_args = ServerArgs( + model_path=args.model, tp_size=args.tp_size, ep_size=args.ep_size + ) + model_config = get_model_config( args.model, args.tp_size, @@ -629,6 +681,7 @@ def main(args: argparse.Namespace): use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a8 = args.dtype == "int8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" + use_int4_w4a16 = 
args.dtype == "int4_w4a16" topk_ids_dir = args.topk_ids_dir if args.batch_size is None: @@ -638,7 +691,7 @@ def main(args: argparse.Namespace): batch_sizes = [args.batch_size] if args.cmp_configs is not None: - worker = BenchmarkWorker(args.seed) + worker = BenchmarkWorker(args.seed, server_args) worker.cmp_configs( batch_sizes, E, @@ -649,6 +702,7 @@ def main(args: argparse.Namespace): use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, block_shape, args.cmp_configs, topk_ids_dir, @@ -657,7 +711,7 @@ def main(args: argparse.Namespace): return if len(batch_sizes) == 1: - worker = BenchmarkWorker(args.seed) + worker = BenchmarkWorker(args.seed, server_args) if args.tune: search_space = get_configs_compute_bound() worker.tune( @@ -670,6 +724,7 @@ def main(args: argparse.Namespace): use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, block_shape, search_space, topk_ids_dir, @@ -695,6 +750,7 @@ def main(args: argparse.Namespace): use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, block_shape, cfg, topk_ids_dir, @@ -708,7 +764,7 @@ def main(args: argparse.Namespace): ray.init() num_gpus = int(ray.available_resources()["GPU"]) workers = [ - ray.remote(num_gpus=1)(BenchmarkWorker).remote(args.seed) + ray.remote(num_gpus=1)(BenchmarkWorker).remote(args.seed, server_args) for _ in range(num_gpus) ] @@ -738,6 +794,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, False, block_shape, ) @@ -759,6 +816,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, block_shape, search_space, topk_ids_dir, @@ -787,6 +845,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, block_shape, ) @@ -801,6 +860,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + 
use_int4_w4a16, block_shape, down_moe=True, ) @@ -818,7 +878,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: parser.add_argument( "--dtype", type=str, - choices=["auto", "fp8_w8a8", "int8_w8a16", "int8_w8a8"], + choices=["auto", "fp8_w8a8", "int8_w8a16", "int8_w8a8", "int4_w4a16"], default="auto", ) parser.add_argument("--seed", type=int, default=0) diff --git a/benchmark/lora/lora_bench.py b/benchmark/lora/lora_bench.py index 4f380c705122..7d3397c0ef75 100644 --- a/benchmark/lora/lora_bench.py +++ b/benchmark/lora/lora_bench.py @@ -35,10 +35,9 @@ _create_bench_client_session, calculate_metrics, get_request, - get_tokenizer, - remove_prefix, - sample_random_requests, ) +from sglang.benchmark.datasets.random import sample_random_requests +from sglang.benchmark.utils import get_tokenizer, remove_prefix global args diff --git a/benchmark/mmlu/bench_sglang.py b/benchmark/mmlu/bench_sglang.py index 23057be4aed8..9a2006e3d2c1 100644 --- a/benchmark/mmlu/bench_sglang.py +++ b/benchmark/mmlu/bench_sglang.py @@ -1,6 +1,8 @@ import argparse import json import os +import subprocess +import tarfile import time import numpy as np @@ -13,6 +15,8 @@ select_sglang_backend, ) +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) + choices = ["A", "B", "C", "D"] tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo") @@ -48,6 +52,28 @@ def gen_prompt(train_df, subject, k=-1): return prompt +def download_data(data_dir): + """Download and extract MMLU data if it doesn't exist.""" + if os.path.isdir(os.path.join(data_dir, "test")): + return + print(f"Data not found at {data_dir}. 
Downloading...") + os.makedirs(data_dir, exist_ok=True) + tar_path = os.path.join(data_dir, "data.tar") + subprocess.check_call( + ["wget", "-O", tar_path, "https://people.eecs.berkeley.edu/~hendrycks/data.tar"] + ) + with tarfile.open(tar_path) as tar: + tar.extractall(path=data_dir, filter="data") + # The tarball extracts into a "data/" subdirectory; move contents up if needed + nested = os.path.join(data_dir, "data") + if os.path.isdir(nested): + for item in os.listdir(nested): + os.rename(os.path.join(nested, item), os.path.join(data_dir, item)) + os.rmdir(nested) + os.remove(tar_path) + print("Download complete.") + + def main(args): subjects = sorted( [ @@ -174,8 +200,11 @@ def few_shot_mmlu(s, examples, question): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--ntrain", "-k", type=int, default=5) - parser.add_argument("--data_dir", "-d", type=str, default="data") + parser.add_argument( + "--data_dir", "-d", type=str, default=os.path.join(SCRIPT_DIR, "data") + ) parser.add_argument("--save_dir", "-s", type=str, default="results") parser.add_argument("--nsub", type=int, default=60) args = add_common_sglang_args_and_parse(parser) + download_data(args.data_dir) main(args) diff --git a/benchmark/tip_suggestion/bench_other.py b/benchmark/tip_suggestion/bench_other.py index 2630081bd620..6e3d098fe5e7 100644 --- a/benchmark/tip_suggestion/bench_other.py +++ b/benchmark/tip_suggestion/bench_other.py @@ -13,8 +13,7 @@ def expand_tip(topic, tip, generate): - s = ( - """Please expand a tip for a topic into a detailed paragraph. + s = """Please expand a tip for a topic into a detailed paragraph. Topic: staying healthy Tip: Regular Exercise @@ -28,12 +27,7 @@ def expand_tip(topic, tip, generate): Tip: structure your content effectively Paragraph: A well-structured post is easier to read and more enjoyable. Start with an engaging introduction that hooks the reader and clearly states the purpose of your post. 
Use headings and subheadings to break up the text and guide readers through your content. Bullet points and numbered lists can make information more digestible. Ensure each paragraph flows logically into the next, and conclude with a summary or call-to-action that encourages reader engagement. -Topic: """ - + topic - + "\nTip: " - + tip - + "\nParagraph:" - ) +Topic: """ + topic + "\nTip: " + tip + "\nParagraph:" return generate(s, max_tokens=128, stop=["\n\n"]) diff --git a/benchmark/tip_suggestion/bench_sglang.py b/benchmark/tip_suggestion/bench_sglang.py index 86c476f97fbf..ef78dce6985c 100644 --- a/benchmark/tip_suggestion/bench_sglang.py +++ b/benchmark/tip_suggestion/bench_sglang.py @@ -14,8 +14,7 @@ @sgl.function def expand_tip(s, topic, tip): - s += ( - """Please expand a tip for a topic into a detailed paragraph. + s += """Please expand a tip for a topic into a detailed paragraph. Topic: staying healthy Tip: Regular Exercise @@ -29,12 +28,7 @@ def expand_tip(s, topic, tip): Tip: structure your content effectively Paragraph: A well-structured post is easier to read and more enjoyable. Start with an engaging introduction that hooks the reader and clearly states the purpose of your post. Use headings and subheadings to break up the text and guide readers through your content. Bullet points and numbered lists can make information more digestible. Ensure each paragraph flows logically into the next, and conclude with a summary or call-to-action that encourages reader engagement. 
-Topic: """ - + topic - + "\nTip: " - + tip - + "\nParagraph:" - ) +Topic: """ + topic + "\nTip: " + tip + "\nParagraph:" s += sgl.gen("paragraph", max_tokens=128, stop=["\n\n"], temperature=0) diff --git a/benchmark/tip_suggestion/lmql_funcs.py b/benchmark/tip_suggestion/lmql_funcs.py index 7790bbe950d2..1d4c97e38c57 100644 --- a/benchmark/tip_suggestion/lmql_funcs.py +++ b/benchmark/tip_suggestion/lmql_funcs.py @@ -2,8 +2,7 @@ async def expand_tip_async(topic, tip, generate): - s = ( - """Please expand a tip for a topic into a detailed paragraph. + s = """Please expand a tip for a topic into a detailed paragraph. Topic: staying healthy Tip: Regular Exercise @@ -17,12 +16,7 @@ async def expand_tip_async(topic, tip, generate): Tip: structure your content effectively Paragraph: A well-structured post is easier to read and more enjoyable. Start with an engaging introduction that hooks the reader and clearly states the purpose of your post. Use headings and subheadings to break up the text and guide readers through your content. Bullet points and numbered lists can make information more digestible. Ensure each paragraph flows logically into the next, and conclude with a summary or call-to-action that encourages reader engagement. 
-Topic: """ - + topic - + "\nTip: " - + tip - + "\nParagraph:" - ) +Topic: """ + topic + "\nTip: " + tip + "\nParagraph:" return await generate(s, max_tokens=128, stop="\n\n") diff --git a/docker/Dockerfile b/docker/Dockerfile index d8a543c87c7a..a0a837ef464c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -19,8 +19,8 @@ ARG PIP_DEFAULT_INDEX ARG UBUNTU_MIRROR ARG GITHUB_ARTIFACTORY=github.com ARG INSTALL_FLASHINFER_JIT_CACHE=0 -ARG FLASHINFER_VERSION=0.6.2 -ARG MOONCAKE_VERSION=0.3.8.post1 +ARG FLASHINFER_VERSION=0.6.4 +ARG MOONCAKE_VERSION=0.3.9 #if need other arg please add in MOONCAKE_COMPILE_ARG ARG MOONCAKE_COMPILE_ARG="-DUSE_HTTP=ON -DUSE_MNNVL=ON -DUSE_CUDA=ON -DWITH_EP=ON" @@ -230,18 +230,21 @@ RUN set -eux; \ cd DeepEP && \ git checkout ${GRACE_BLACKWELL_DEEPEP_BRANCH} && \ sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \ + sed -i 's/#define NUM_TIMEOUT_CYCLES 200000000000ull/#define NUM_TIMEOUT_CYCLES 2000000000000ull/' csrc/kernels/configs.cuh && \ cd .. ; \ elif [ "$HOPPER_SBO" = "1" ]; then \ git clone https://github.com/deepseek-ai/DeepEP.git -b antgroup-opt && \ cd DeepEP && \ git checkout ${HOPPER_SBO_DEEPEP_COMMIT} && \ sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \ + sed -i 's/#define NUM_TIMEOUT_CYCLES 200000000000ull/#define NUM_TIMEOUT_CYCLES 2000000000000ull/' csrc/kernels/configs.cuh && \ cd .. ; \ else \ curl --retry 3 --retry-delay 2 -fsSL -o ${DEEPEP_COMMIT}.zip \ https://${GITHUB_ARTIFACTORY}/deepseek-ai/DeepEP/archive/${DEEPEP_COMMIT}.zip && \ unzip -q ${DEEPEP_COMMIT}.zip && rm ${DEEPEP_COMMIT}.zip && mv DeepEP-${DEEPEP_COMMIT} DeepEP && cd DeepEP && \ sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \ + sed -i 's/#define NUM_TIMEOUT_CYCLES 200000000000ull/#define NUM_TIMEOUT_CYCLES 2000000000000ull/' csrc/kernels/configs.cuh && \ cd .. 
; \ fi diff --git a/docker/npu.Dockerfile b/docker/npu.Dockerfile index fd526649fd9e..937a0de14632 100644 --- a/docker/npu.Dockerfile +++ b/docker/npu.Dockerfile @@ -93,9 +93,9 @@ RUN git clone https://github.com/sgl-project/sglang --branch $SGLANG_TAG && \ RUN ${PIP_INSTALL} wheel==0.45.1 pybind11 pyyaml decorator scipy attrs psutil \ && mkdir sgl-kernel-npu \ && cd sgl-kernel-npu \ - && wget https://github.com/sgl-project/sgl-kernel-npu/releases/download/${SGLANG_KERNEL_NPU_TAG}/sgl-kernel-npu-${SGLANG_KERNEL_NPU_TAG}-${CANN_VERSION}-${DEVICE_TYPE}-$(arch).zip \ - && unzip sgl-kernel-npu-${SGLANG_KERNEL_NPU_TAG}-${CANN_VERSION}-${DEVICE_TYPE}-$(arch).zip \ - && ${PIP_INSTALL} output/deep_ep*.whl output/sgl_kernel_npu*.whl \ + && wget https://github.com/sgl-project/sgl-kernel-npu/releases/download/${SGLANG_KERNEL_NPU_TAG}/sgl-kernel-npu-${SGLANG_KERNEL_NPU_TAG}-torch2.8.0-py311-cann${CANN_VERSION}-${DEVICE_TYPE}-$(arch).zip \ + && unzip sgl-kernel-npu-${SGLANG_KERNEL_NPU_TAG}-torch2.8.0-py311-cann${CANN_VERSION}-${DEVICE_TYPE}-$(arch).zip \ + && ${PIP_INSTALL} deep_ep*.whl sgl_kernel_npu*.whl \ && cd .. && rm -rf sgl-kernel-npu \ && cd "$(python3 -m pip show deep-ep | awk '/^Location:/ {print $2}')" && ln -sf deep_ep/deep_ep_cpp*.so diff --git a/docker/rocm.Dockerfile b/docker/rocm.Dockerfile index 1357db036d29..712d7f5bdb4b 100644 --- a/docker/rocm.Dockerfile +++ b/docker/rocm.Dockerfile @@ -1,14 +1,20 @@ # Usage (to build SGLang ROCm docker image): -# docker build --build-arg SGL_BRANCH=v0.5.8 --build-arg GPU_ARCH=gfx942 -t v0.5.8-rocm700-mi30x -f rocm.Dockerfile . -# docker build --build-arg SGL_BRANCH=v0.5.8 --build-arg GPU_ARCH=gfx950 -t v0.5.8-rocm700-mi35x -f rocm.Dockerfile . +# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx942 -t v0.5.9-rocm700-mi30x -f rocm.Dockerfile . +# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx942-rocm720 -t v0.5.9-rocm720-mi30x -f rocm.Dockerfile . 
+# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx950 -t v0.5.9-rocm700-mi35x -f rocm.Dockerfile . +# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx950-rocm720 -t v0.5.9-rocm720-mi35x -f rocm.Dockerfile . # Usage (to build SGLang ROCm + Mori docker image): -# docker build --build-arg SGL_BRANCH=v0.5.8 --build-arg GPU_ARCH=gfx942 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.8-rocm700-mi30x -f rocm.Dockerfile . -# docker build --build-arg SGL_BRANCH=v0.5.8 --build-arg GPU_ARCH=gfx950 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.8-rocm700-mi35x -f rocm.Dockerfile . +# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx942 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.9-rocm700-mi30x -f rocm.Dockerfile . +# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx942-rocm720 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.9-rocm720-mi30x -f rocm.Dockerfile . +# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx950 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.9-rocm700-mi35x -f rocm.Dockerfile . +# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx950-rocm720 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.9-rocm720-mi35x -f rocm.Dockerfile . 
# Default base images ARG BASE_IMAGE_942="rocm/sgl-dev:rocm7-vllm-20250904" +ARG BASE_IMAGE_942_ROCM720="rocm/pytorch:rocm7.2_ubuntu22.04_py3.10_pytorch_release_2.9.1" ARG BASE_IMAGE_950="rocm/sgl-dev:rocm7-vllm-20250904" +ARG BASE_IMAGE_950_ROCM720="rocm/pytorch:rocm7.2_ubuntu22.04_py3.10_pytorch_release_2.9.1" # This is necessary for scope purpose ARG GPU_ARCH=gfx950 @@ -21,7 +27,17 @@ ENV BUILD_TRITON="0" ENV BUILD_LLVM="0" ENV BUILD_AITER_ALL="1" ENV BUILD_MOONCAKE="1" -ENV AITER_COMMIT="v0.1.9.post1" +ENV AITER_COMMIT="v0.1.11.post1" + +# =============================== +# Base image 942 with rocm720 and args +FROM $BASE_IMAGE_942_ROCM720 AS gfx942-rocm720 +ENV BUILD_VLLM="0" +ENV BUILD_TRITON="1" +ENV BUILD_LLVM="0" +ENV BUILD_AITER_ALL="1" +ENV BUILD_MOONCAKE="1" +ENV AITER_COMMIT="v0.1.11.post1" # =============================== # Base image 950 and args @@ -29,9 +45,20 @@ FROM $BASE_IMAGE_950 AS gfx950 ENV BUILD_VLLM="0" ENV BUILD_TRITON="0" ENV BUILD_LLVM="0" -ENV BUILD_AITER_ALL="0" +ENV BUILD_AITER_ALL="1" +ENV BUILD_MOONCAKE="1" +ENV AITER_COMMIT="v0.1.11.post1" + +# =============================== +# Base image 950 with rocm720 and args +FROM $BASE_IMAGE_950_ROCM720 AS gfx950-rocm720 +ENV BUILD_VLLM="0" +ENV BUILD_TRITON="1" +ENV BUILD_LLVM="0" +ENV BUILD_AITER_ALL="1" ENV BUILD_MOONCAKE="1" -ENV AITER_COMMIT="v0.1.9.post1" +ENV AITER_COMMIT="v0.1.11.post1" + # =============================== # Chosen arch and args FROM ${GPU_ARCH} @@ -39,6 +66,7 @@ FROM ${GPU_ARCH} # This is necessary for scope purpose, again ARG GPU_ARCH=gfx950 ENV GPU_ARCH_LIST=${GPU_ARCH%-*} +ENV PYTORCH_ROCM_ARCH=gfx942;gfx950 ARG SGL_REPO="https://github.com/sgl-project/sglang.git" ARG SGL_DEFAULT="main" @@ -47,8 +75,8 @@ ARG SGL_BRANCH=${SGL_DEFAULT} # Version override for setuptools_scm (used in nightly builds) ARG SETUPTOOLS_SCM_PRETEND_VERSION="" -ARG TRITON_REPO="https://github.com/ROCm/triton.git" -ARG TRITON_COMMIT="improve_fa_decode_3.0.0" +ARG 
TRITON_REPO="https://github.com/triton-lang/triton.git" +ARG TRITON_COMMIT="42270451990532c67e69d753fbd026f28fcc4840" ARG AITER_REPO="https://github.com/ROCm/aiter.git" @@ -70,17 +98,58 @@ ARG ENABLE_MORI=0 ARG NIC_BACKEND=none ARG MORI_REPO="https://github.com/ROCm/mori.git" -ARG MORI_COMMIT="b0dce4beebeb1f26c784eee17d5fd9785ee9447f" +ARG MORI_COMMIT="2f88d06aba75400262ca5c1ca5986cf1fdf4cd82" # AMD AINIC apt repo settings ARG AINIC_VERSION=1.117.5 ARG UBUNTU_CODENAME=jammy USER root +# Fix hipDeviceGetName returning empty string in ROCm 7.0 docker images. +# The ROCm 7.0 base image is missing libdrm-amdgpu-common which provides the +# amdgpu.ids device-ID-to-marketing-name mapping file. +# ROCm 7.2 base images already ship these packages, so this step is skipped. +# See https://github.com/ROCm/ROCm/issues/5992 +RUN set -eux; \ + case "${GPU_ARCH}" in \ + *rocm720*) \ + echo "ROCm 7.2 (GPU_ARCH=${GPU_ARCH}): libdrm-amdgpu packages already present, skipping"; \ + ;; \ + *) \ + echo "ROCm 7.0 (GPU_ARCH=${GPU_ARCH}): installing libdrm-amdgpu packages"; \ + curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key \ + | gpg --dearmor -o /etc/apt/keyrings/amdgpu-graphics.gpg \ + && echo 'deb [arch=amd64,i386 signed-by=/etc/apt/keyrings/amdgpu-graphics.gpg] https://repo.radeon.com/graphics/7.0/ubuntu jammy main' \ + > /etc/apt/sources.list.d/amdgpu-graphics.list \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + libdrm-amdgpu-common \ + libdrm-amdgpu-amdgpu1 \ + libdrm2-amdgpu \ + && rm -rf /var/lib/apt/lists/* \ + && cp /opt/amdgpu/share/libdrm/amdgpu.ids /usr/share/libdrm/amdgpu.ids; \ + ;; \ + esac + + # Install some basic utilities RUN python -m pip install --upgrade pip && pip install setuptools_scm RUN apt-get purge -y sccache; python -m pip uninstall -y sccache; rm -f "$(which sccache)" +# Install AMD SMI Python package from ROCm distribution. +# The ROCm 7.2 base image (rocm/pytorch) does not pre-install this package. 
+RUN set -eux; \ + case "${GPU_ARCH}" in \ + *rocm720*) \ + echo "ROCm 7.2 flavor detected from GPU_ARCH=${GPU_ARCH}"; \ + cd /opt/rocm/share/amd_smi \ + && python3 -m pip install --no-cache-dir . \ + ;; \ + *) \ + echo "Not rocm720 (GPU_ARCH=${GPU_ARCH}), skip amdsmi installation"; \ + ;; \ + esac + WORKDIR /sgl-workspace # ----------------------- @@ -102,44 +171,43 @@ RUN if [ "$BUILD_LLVM" = "1" ]; then \ # (SETUPTOOLS_SCM_PRETEND_VERSION is set later for SGLang nightly builds and would otherwise # leak into AITER's version when AITER uses setuptools_scm) ENV SETUPTOOLS_SCM_PRETEND_VERSION= -RUN pip uninstall -y aiter +RUN pip uninstall -y aiter \ + && pip install flydsl==0.0.1.dev95158637 \ + && pip install psutil pybind11 # Required by AITER setup.py RUN git clone ${AITER_REPO} \ && cd aiter \ && git checkout ${AITER_COMMIT} \ && git submodule update --init --recursive + +# Hot patches for AITER in v0.1.10.post3 +# This is for ROCm 7.2 only, because of the image rebase from vllm +# to rocm/pytorch. 
+RUN set -eux; \ + case "${GPU_ARCH}" in \ + *rocm720*) \ + echo "ROCm 7.2 flavor detected from GPU_ARCH=${GPU_ARCH}"; \ + cd aiter \ + && sed -i '459 s/if.*:/if False:/' aiter/ops/triton/attention/pa_mqa_logits.py; \ + ;; \ + *) \ + echo "Not rocm720 (GPU_ARCH=${GPU_ARCH}), skip patch"; \ + ;; \ + esac +# [WA] from kk-huang +# add sed -i '/c1 = torch.empty((M, D, S1 + S3) for aiter triton gemm config issue +# the corresponding pr is https://github.com/ROCm/aiter/pull/2173 +# it will be removed when server launched issue is fixed by aiter RUN cd aiter \ && echo "[AITER] GPU_ARCH=${GPU_ARCH}" \ + && sed -i '/c1 = torch.empty((M, D, S1 + S3), dtype=dtype, device=x.device)/i\ config = dict(config)' aiter/ops/triton/gemm/fused/fused_gemm_afp4wfp4_split_cat.py \ && if [ "$BUILD_AITER_ALL" = "1" ] && [ "$BUILD_LLVM" = "1" ]; then \ sh -c "HIP_CLANG_PATH=/sgl-workspace/llvm-project/build/bin/ PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \ elif [ "$BUILD_AITER_ALL" = "1" ]; then \ sh -c "PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \ else \ sh -c "GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \ - fi - -# ----------------------- -# Triton (TODO: remove this after Triton is no longer needed) -RUN if [ "$BUILD_TRITON" = "1" ]; then \ - pip uninstall -y triton \ - && git clone ${TRITON_REPO} \ - && cd triton \ - && git checkout ${TRITON_COMMIT} \ - && cd python \ - && python setup.py install; \ - fi - -# ----------------------- -# Build vLLM -ARG VLLM_REPO="https://github.com/ROCm/vllm.git" -ARG VLLM_BRANCH="9f6b92db47c3444b7a7d67451ba0c3a2d6af4c2c" -RUN if [ "$BUILD_VLLM" = "1" ]; then \ - git clone ${VLLM_REPO} \ - && cd vllm \ - && git checkout ${VLLM_BRANCH} \ - && python -m pip install -r requirements/rocm.txt \ - && python setup.py clean --all \ - && python setup.py develop; \ - fi + fi \ + && echo "export PYTHONPATH=/sgl-workspace/aiter:\${PYTHONPATH}" >> /etc/bash.bashrc # ----------------------- # Build Mooncake 
@@ -214,10 +282,10 @@ RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \ ENV CARGO_BUILD_JOBS=4 # Build and install sgl-model-gateway -RUN python3 -m pip install --no-cache-dir setuptools-rust \ +RUN python3 -m pip install --no-cache-dir maturin \ && cd /sgl-workspace/sglang/sgl-model-gateway/bindings/python \ - && /bin/bash -lc 'ulimit -n 8192 && cargo build --release' \ - && python3 -m pip install --no-cache-dir . \ + && ulimit -n 65536 && maturin build --release --features vendored-openssl --out dist \ + && python3 -m pip install --force-reinstall dist/*.whl \ && rm -rf /root/.cache # ----------------------- @@ -234,7 +302,7 @@ RUN /bin/bash -lc 'set -euo pipefail; \ libgtest-dev libgmock-dev \ libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev \ python3 python3-dev python3-setuptools python3-pip python3-apt \ - gcc libtinfo-dev zlib1g-dev libedit-dev libxml2-dev \ + gcc libtinfo-dev zlib1g-dev libedit-dev libxml2-dev vim \ cmake ninja-build pkg-config libstdc++6 software-properties-common \ && rm -rf /var/lib/apt/lists/*; \ \ @@ -278,8 +346,9 @@ RUN /bin/bash -lc 'set -euo pipefail; \ printf "#!/usr/bin/env bash\nexec \"%s\" \"\$@\"\n" "$LLVM_CONFIG_PATH" > /usr/local/bin/llvm-config-16 && \ chmod +x /usr/local/bin/llvm-config-16; \ \ - # TVM Python bits need Cython + z3 before configure - "$VENV_PIP" install --no-cache-dir "cython>=0.29.36,<3.0" "apache-tvm-ffi>=0.1.6" "z3-solver>=4.13.0"; \ + # TVM Python bits need Cython + z3 before configure. + # Pin z3-solver==4.15.4.0: 4.15.4.0 has a manylinux wheel; 4.15.5.0 has no wheel and builds from source (fails: C++20 needs GCC 14+, image has GCC 11). 
+ "$VENV_PIP" install --no-cache-dir "cython>=0.29.36,<3.0" "apache-tvm-ffi @ git+https://github.com/apache/tvm-ffi.git@37d0485b2058885bf4e7a486f7d7b2174a8ac1ce" "z3-solver==4.15.4.0"; \ \ # Clone + pin TileLang (bundled TVM), then build git clone --recursive "${TILELANG_REPO}" /opt/tilelang && \ @@ -305,7 +374,8 @@ RUN /bin/bash -lc 'set -euo pipefail; \ # Python tools RUN python3 -m pip install --no-cache-dir \ py-spy \ - pre-commit + pre-commit \ + tabulate # ----------------------- # MORI (optional) @@ -374,9 +444,95 @@ RUN /bin/bash -lc 'set -euo pipefail; \ echo "export PYTHONPATH=/sgl-workspace/mori:\${PYTHONPATH}" >> /etc/bash.bashrc; \ echo "[MORI] Done."' +# ----------------------- +# Hot patch: torch-ROCm +# The artifact hardcoded the supported triton version to be 3.5.1. +# Rewrite the restriction directly. +ARG TORCH_ROCM_FILE="torch-2.9.1+rocm7.2.0.lw.git7e1940d4-cp310-cp310-linux_x86_64.whl" +RUN mkdir /tmp/whl && cd /tmp/whl \ + && export TORCH_ROCM_FILE="${TORCH_ROCM_FILE}" \ + && cat > hack.py <<"PY" +import zipfile, csv, os, re +from pathlib import Path + +fname = os.environ["TORCH_ROCM_FILE"] +in_whl = Path("/") / fname +out_whl = Path("/tmp")/ fname +work = Path("/tmp/whl") + +# 1) Extract +with zipfile.ZipFile(in_whl, "r") as z: + z.extractall(work) + +# 2) Locate dist-info and patch METADATA (edit this logic to match your exact line) +dist_info = next(work.glob("*.dist-info")) +meta = dist_info / "METADATA" +txt = meta.read_text(encoding="utf-8") + +# Example: replace one exact requirement form. +# Adjust the string to match what you actually see. 
+pat = r"^Requires-Dist:\s*triton==3\.5\.1[^\s]*;"
+txt2, n = re.subn(pat, r"Requires-Dist: triton>=3.5.1;", txt, flags=re.MULTILINE)
+if txt2 == txt:
+    raise SystemExit("Did not find expected Requires-Dist line to replace in METADATA")
+meta.write_text(txt2, encoding="utf-8")
+
+# 3) Hacky step: blank hash/size columns in RECORD
+record = dist_info / "RECORD"
+rows = []
+with record.open(newline="", encoding="utf-8") as f:
+    for r in csv.reader(f):
+        if not r:
+            continue
+        # keep filename, blank out hash and size
+        rows.append([r[0], "", ""])
+with record.open("w", newline="", encoding="utf-8") as f:
+    csv.writer(f).writerows(rows)
+
+# 4) Re-zip as a wheel
+with zipfile.ZipFile(out_whl, "w", compression=zipfile.ZIP_DEFLATED) as z:
+    for p in work.rglob("*"):
+        if p.is_file():
+            z.write(p, p.relative_to(work).as_posix())
+
+print("Wrote", out_whl)
+PY
+
+RUN cd /tmp/whl \
+    && case "${GPU_ARCH}" in \
+        *rocm720*) \
+            echo "ROCm 7.2 flavor detected from GPU_ARCH=${GPU_ARCH}"; \
+            python hack.py \
+            && python3 -m pip install --force --no-deps /tmp/${TORCH_ROCM_FILE} \
+            && rm -fr /tmp/whl /tmp/${TORCH_ROCM_FILE} \
+            ;; \
+        *) \
+            echo "Not rocm720 (GPU_ARCH=${GPU_ARCH}), skip patch"; \
+            ;; \
+    esac
+
+
+# -----------------------
+# Hot patch: Triton
+# For ROCm 7.2, this custom build breaks pip dependency management,
+# so future `pip install` will break the ROCm stack.
+# A workaround for this is to reinstall the default triton
+# wheel with the `rocm/pytorch` image in the root directory.
+RUN if [ "$BUILD_TRITON" = "1" ]; then \
+    pip uninstall -y triton \
+    && apt install -y cmake \
+    && git clone ${TRITON_REPO} triton-custom \
+    && cd triton-custom \
+    && git checkout ${TRITON_COMMIT} \
+    && pip install -r python/requirements.txt \
+    && pip install -e .; \
+    fi
+
+# -----------------------
+# Performance environment variable.
+# Skip CuDNN compatibility check - not applicable for ROCm (uses MIOpen instead) +ENV SGLANG_DISABLE_CUDNN_CHECK=1 ENV HIP_FORCE_DEV_KERNARG=1 ENV HSA_NO_SCRATCH_RECLAIM=1 ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 @@ -389,10 +545,7 @@ ENV SGLANG_USE_AITER=1 ENV SGLANG_USE_ROCM700A=1 ENV NCCL_MIN_NCHANNELS=112 -ENV VLLM_FP8_PADDING=1 -ENV VLLM_FP8_ACT_PADDING=1 -ENV VLLM_FP8_WEIGHT_PADDING=1 -ENV VLLM_FP8_REDUCE_CONV=1 +ENV ROCM_QUICK_REDUCE_QUANTIZATION=INT8 ENV TORCHINDUCTOR_MAX_AUTOTUNE=1 ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1 diff --git a/docs/Makefile b/docs/Makefile index a15ce24d9089..716160e56684 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -44,17 +44,22 @@ markdown: echo "Exporting docs to Markdown..."; \ mkdir -p "$(BUILDDIR)/html/markdown"; \ \ - # 1) Copy .md and .rst files under subdirectories only (exclude root-level files) \ - find $(SOURCEDIR) -mindepth 2 -path "*/_build/*" -prune -o \( -name "*.md" -o -name "*.rst" \) -print0 | \ + # 1) Copy .md and .rst files as-is; additionally convert .rst -> .md \ + find $(SOURCEDIR) -path "*/_build/*" -prune -o \( -name "*.md" -o -name "*.rst" \) -print0 | \ parallel -0 -j3 --halt soon,fail=1 ' \ SRC="{}"; \ REL_DIR=$$(dirname "$$SRC"); \ OUT_DIR="$(BUILDDIR)/html/markdown/$$REL_DIR"; \ mkdir -p "$$OUT_DIR"; \ cp -f "$$SRC" "$$OUT_DIR/"; \ + case "$$SRC" in \ + *.rst) \ + BASE=$$(basename "$$SRC" .rst); \ + pandoc -f rst -t gfm "$$SRC" -o "$$OUT_DIR/$$BASE.md" ;; \ + esac \ ' || exit 1; \ \ - # 2) Convert .ipynb -> .md (all notebooks, including root if any) \ + # 2) Convert .ipynb -> .md \ find $(SOURCEDIR) -path "*/_build/*" -prune -o -name "*.ipynb" -print0 | \ parallel -0 -j3 --halt soon,fail=1 ' \ NB_SRC="{}"; \ @@ -72,6 +77,7 @@ markdown: echo "Markdown artifacts written to: $(BUILDDIR)/html/markdown" + # Serve documentation with auto-build and live reload serve: @echo "Starting auto-build server at http://0.0.0.0:$(PORT)" diff --git a/docs/_static/image/dpa.png b/docs/_static/image/dpa.png 
new file mode 100644 index 000000000000..672e022186e4 Binary files /dev/null and b/docs/_static/image/dpa.png differ diff --git a/docs/advanced_features/attention_backend.md b/docs/advanced_features/attention_backend.md index 046c125d32a4..af163fc8b236 100644 --- a/docs/advanced_features/attention_backend.md +++ b/docs/advanced_features/attention_backend.md @@ -49,10 +49,14 @@ Multimodal attention is selected by `--mm-attention-backend`. The "MultiModal" c ``` ```{note} -- FlashAttention 4 is prefill-only for now. +- FlashAttention 4 supports both prefill and decode on SM90 (Hopper) and SM100 (Blackwell). On SM90, `page_size` must be 128. - NSA is specifically designed for [DeepSeek V3.2 DSA](https://lmsys.org/blog/2025-09-29-deepseek-V32/). ``` +```{warning} +**FA4 on Hopper (SM90):** FA4 decode speed decreases as sequence length grows due to lack of SplitKV support. At batch=1 compared to FA3 on H100: ~-10% at 2K tokens, ~-18% at 4K, ~-31% at 8K, ~-49% at 16K. Larger batch sizes reduce the gap (e.g., batch=8: ~-2% at 2K, ~-8% at 4K). Blackwell (SM100) is not affected. +``` + ```{note} For the KV4 FA4 scenario, FA4 requires using a different --decode-attention-backend to run. Except for trtllm_mha being incompatible with FA4, all other decode backends behave as shown in the table. ``` @@ -73,6 +77,28 @@ MLA page-size constraints: - Cutlass MLA: page_size = 128. - TRTLLM MLA: page_size ∈ {32, 64}. +### GDN Attention Backends + +GDN (Gated Delta Network) is a linear attention mechanism with O(n) complexity, used in hybrid models that alternate GDN linear attention layers with standard full attention layers. GDN is **not** selected via `--attention-backend`; it is automatically activated when the model architecture requires it (e.g., Qwen 3.5, Qwen 3 Next, Jet Nemotron, Jet VLM). + +The GDN linear attention layers have their own kernel backends, selected via `--linear-attn-backend` (default: `triton`). 
You can override the kernel per phase with `--linear-attn-decode-backend` and `--linear-attn-prefill-backend`. + +| **Backend** | **Decode** | **Prefill / Extend** | **Spec Decoding (Target Verify)** | +|--------------------------|------------|----------------------|-----------------------------------| +| **Triton (CUDA)** | ✅ | ✅ | ✅ | +| **Triton (AMD/ROCm)** | ✅ | ✅ | ✅ | +| **Triton (NPU)** | ✅ | ✅ | ❌ | +| **Triton (CPU)** | ✅ | ✅ | ❌ | +| **CuTe DSL (CUDA only)**| ✅ | ❌ | ❌ | + +```{important} +GDN models are hybrid: the full-attention layers still require a standard `--attention-backend`. Platform constraints for the full-attention backend on hybrid GDN models: +- **Blackwell (e.g., B200)**: `triton`, `trtllm_mha`, or `fa4` only. +- **NPU (Ascend)**: `ascend` only. +- **AMD (ROCm)**: `triton` recommended. +- **Other CUDA (Hopper, Ampere, etc.)**: auto-selection works; no special constraints. +``` + ### Hybrid attention (different backends for prefill vs decode) (Experimental) ```{warning} @@ -202,8 +228,34 @@ python3 -m sglang.launch_server \ --trust-remote-code ``` +- TRTLLM MHA (Optimized for Blackwell Architecture, e.g., B200) +```bash +python3 -m sglang.launch_server \ + --tp 4 \ + --model Qwen/Qwen3.5-35B-A3B-FP8 \ + --attention-backend trtllm_mha \ + --trust-remote-code +``` + +- TRTLLM MHA (XQA backend) (Optimized for SM90 and SM120, e.g., H20, H200, 5090) +Note that TRTLLM XQA backend only works well for pagesize 64. 
+```bash +python3 -m sglang.launch_server \ + --tp 4 \ + --model Qwen/Qwen3.5-35B-A3B-FP8 \ + --decode-attention-backend trtllm_mha \ + --trust-remote-code +``` + - FlashAttention 4 (MHA & MLA) ```bash +# FA4 for both prefill and decode on SM90/SM100 +python3 -m sglang.launch_server \ + --model-path Qwen/Qwen3-30B-A3B-Instruct-2507-FP8 \ + --attention-backend fa4 \ + --page-size 128 \ + --trust-remote-code + python3 -m sglang.launch_server \ --tp 8 \ --model deepseek-ai/DeepSeek-R1 \ @@ -267,6 +319,10 @@ To add a new attention backend, you can learn from the existing backends (`python/sglang/srt/layers/attention/triton_backend.py`, `python/sglang/srt/layers/attention/flashattention_backend.py`) and follow the steps below. +```{note} +Linear attention kernel backends (GDN, KDA) follow a different pattern. They implement `LinearAttnKernelBase` in `python/sglang/srt/layers/attention/linear/kernels/` and are dispatched by `GDNKernelDispatcher` / `KDAKernelDispatcher` rather than registered via `@register_attention_backend`. +``` + 1. Run without cuda graph. Support the two forward functions - forward_extend - Will be used for prefill, prefill with KV cache, and target verification diff --git a/docs/advanced_features/dp_dpa_smg_guide.md b/docs/advanced_features/dp_dpa_smg_guide.md new file mode 100644 index 000000000000..9ec5df64856e --- /dev/null +++ b/docs/advanced_features/dp_dpa_smg_guide.md @@ -0,0 +1,373 @@ +# DP, DPA and SGLang DP Router + +This guide explains the difference between Data Parallelism (DP) and Data Parallelism Attention (DPA), how to enable each mode correctly, and how to use the SGLang Model Gateway (SMG) for production-grade DP deployments. + +## Data Parallelism (DP) + +**Data Parallelism (DP)** is the most common parallelism strategy that replicates the entire model across multiple GPU sets and processes different batches of requests in parallel. Each GPU set handles independent requests. 
With dedicated routing strategies, as we will introduce later, with those proper routing algorithms in SGLang Model Gateway, the throughput of your serving system could be multiplied nearly linearly. + +### Key characteristics + +- Each replica has a full copy of the model +- Requests are distributed/scattered across replicas +- No inter-replica communication during one request's inference (for simple DP) + +## Data Parallelism Attention (DPA) + +**Data Parallelism Attention (DPA)**, also known as DP Attention, is an advanced parallelism strategy. While DPA provides the most significant benefits for **Multi-Head Latent Attention (MLA)** models (such as DeepSeek, MiniMax, Kimi-K2), it also supports **standard attention models** like Qwen. + +### The Problem with Tensor Parallelism for MLA Models + +The most common parallelism strategy for inference is **Tensor Parallelism (TP)**. However, TP might not be the most efficient strategy for certain models. For example, DeepSeek models use MLA and only have **one KV head**. If we use tensor parallelism on 8 GPUs, it will lead to: + +- **Duplicated KV cache** across all GPUs +- **Unwanted memory usage** that limits batch size +- **Reduced throughput** due to memory constraints + +### How DPA Works + +DPA addresses these limitations by applying **data parallelism specifically to the attention component**. + + + + + + +
+DPA + EP Architecture + + +**Each DP replica:** + +- Processes different batches independently (can be in different forward modes: prefill, decode, or idle) +- Maintains its own KV cache (no duplication) +- Enables significantly larger batch sizes due to memory savings + +**Communication patterns in DPA + EP:** +- +- **All2All (Dispatch)**: Routes tokens to expert sub-groups based on gating decisions +- **All2All (Combine)**: Gathers computed results from experts back to original token positions + +
+ +### Key benefits of DPA + +1. **Significantly reduced KV cache memory**: Each DP replica only stores KV cache for its own batches +2. **Larger batch sizes**: Memory savings enable larger batch sizes +3. **Improved decoding throughput**: Significant throughput gains for MLA-based models +4. **Independent forward modes**: Each DP replica can be in different forward modes (prefill, decode, or idle) and handles its assigned batches independently during attention computation + +### DPA with Expert Parallelism for MoE + +For MoE models like DeepSeek, DPA is **often** paired with Expert Parallelism (EP) for best throughput at scale. However, **DPA does not require EP**: you can enable DPA without EP if your deployment does not need expert sharding. + +- Distribute 256+ expert weights across GPUs (cannot fit on a single GPU) +- Enable efficient all-to-all token routing via DeepEP +- Scale to large clusters (up to 5x throughput improvement over vanilla TP) + +### Recommended setup for DeepSeek + +```bash +python -m sglang.launch_server \ + --model-path deepseek-ai/DeepSeek-V3 \ + --tp 8 \ + --dp-size 8 \ + --ep 8 \ + --enable-dp-attention \ + --moe-a2a-backend deepep \ + --moe-runner-backend deep_gemm +``` + +> **Note**: `--dp-size` must be explicitly set when using `--enable-dp-attention`. If `dp_size` is 1 (default), DPA will be disabled. + +For detailed EP configuration (DeepEP, Two-Batch Overlap, EPLB), see [Expert Parallelism](expert_parallelism.md). 
+ +### Target Models + +DPA supports the following model architectures: + +- **MLA (Multi-Head Latent Attention) models** - where DPA provides the most significant benefits: + - DeepSeek family (DeepSeek-V2, DeepSeek-V3, DeepSeek-R1) + - MiniMax models + - Kimi-K2 + - Other models using MLA architecture + +- **Standard attention models** - also supported: + - Qwen models (see [PR #6121](https://github.com/sgl-project/sglang/pull/6121)) + +For models like Llama, with standard GQA, standard DP, or TP is typically recommended. + +To enable DPA, add `--enable-dp-attention` to your server launch command. + +### Activation Logic + +DPA is enabled explicitly via server arguments (CLI or config). You must set both `--dp-size` and `--enable-dp-attention`: + +```bash +python -m sglang.launch_server \ + --model-path deepseek-ai/DeepSeek-V3 \ + --tp 8 \ + --dp-size 8 \ + --enable-dp-attention +``` + +**Important**: `--dp-size` must be greater than 1 for DPA to work. When `dp_size == 1` (default), `--enable-dp-attention` is automatically disabled. The constraint `tp_size % dp_size == 0` must also be satisfied. + +### Standard DP for MLA models + +Note that MLA models, of course, also support DP. Suppose you want to enable standard DP for MLA models. First, launch each MLA model's replica independently. You may launch these replicas one by one with DPA enabled. After launching each MLA model's replica, launch an SMG and connect all the replicas to the SMG. A detailed explanation of SMG is as follows. + +## Modern Data Parallelism SGLang Model Gateway (SMG) + +### Native DP Mode + +Native DP (built-in Data Parallelism) in SGLang creates multiple worker processes within a single SGLang instance, under the control of `DataParallelController` with the launching parameter of `dp-size`. 
+
+
+```bash
+# Native DP mode
+python -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --dp-size 4
+```
+
+**Limitations:**
+
+- Built-in in-process load balancing only (e.g., `round_robin`, `total_requests`, `total_tokens`)
+- No cache-aware routing
+- Limited observability and metrics
+- No fault tolerance or circuit breakers
+- Not suitable for production workloads
+
+⚠️ Native DP is **highly not recommended for use right now**. It is only used in some ancient/outdated RL frameworks. You can use SGLang Model Gateway (SMG) to power up your data parallelism in any use case.
+
+### SMG-Based DP (Recommended)
+
+Starting from September 2024, SGLang Model Gateway, i.e., SMG, formerly named SGLang DP Router, was built especially as a production-ready DP routing system with Rust. It starts from DP routing, but later we further expanded its scope to coordinate RL, PD Disaggregation, and other scenarios. This doc only discusses SMG's usage in DP routing. For other usage, please refer to [SGLang Model Gateway Documentation](sgl_model_gateway.md).
+
+> To achieve the best production-level routing performance and reduce the overhead to an extreme extent, we use Rust to build SMG, but not Python, since Python is never FAST enough.
+
+**We strongly recommend using the SGLang Model Gateway (SMG) for production-grade Data Parallelism.** SMG provides significant advantages over native DP mode.
+
+```bash
+# SMG-based DP mode (Recommended)
+python -m sglang_router.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --dp-size 4
+```
+
+⚠️ Note that **SMG and Native DP share the same launching parameter, `--dp-size`**. But the entrypoint of Native DP is `python -m sglang.launch_server`, and SMG's entrypoint is `python -m sglang_router.launch_server`.
+ +**Advantages of SMG-Based DP:** + +| Feature | Native DP | SMG-Based DP | +|---------|-----------|--------------| +| **Load Balancing** | Built-in in-process methods | Advanced policies (cache-aware, power-of-two, etc.) | +| **Cache Awareness** | ❌ No | ✅ Yes - significantly higher cache hit rate | +| **Throughput** | Baseline | Significant improvement | +| **Multi-Node Support** | Limited | ✅ Full support | +| **Worker Health Monitoring** | Basic | ✅ Circuit breakers, health checks | +| **Reliability** | Basic | ✅ Retries, rate limiting, queuing | +| **Observability** | Basic metrics | ✅ 40+ Prometheus metrics, OpenTelemetry | +| **Hot Worker Add/Remove** | ❌ No | ✅ Yes | + +### SMG's Performance + +The cache-aware routing policy in SMG significantly improves performance for workloads with shared prefixes: + +| Metric | Without Cache-Aware | With Cache-Aware SMG | +|--------|---------------------|----------------------| +| Throughput (token/s) | 82,665 | 158,596 (+92%) | +| Cache Hit Rate | 20% | 75% (+275%) | + +*Benchmark from [SGLang v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/), workload with multiple long prefix groups, 8x A100 80GB GPUs, dp-size=8* + +### When to Use Each + +**Use Native DP when:** + +- ~Never use Native/Naive DP~ +- Learning material of DP routing + +**Use SMG-Based DP when:** + +- In any case, when you think DP is needed +- Production deployments +- Multi-node distributed setups +- Workloads with shared prefixes (high cache reuse potential) +- You need high availability and reliability features +- You require detailed observability and metrics +- You want to have highly efficient RL rollout systems + +Note that for RL rollout systems, **there are four crucial reasons that SMG-Based DP is far better than naive DP routing**. Details can be found at [Load Balancing Router in RL](./sglang_for_rl.md#load-balancing-router). 
+ +### Quick Start For SMG + +**Installation** + +```bash +pip install sglang-router +# or +pip install "sglang[all]" +``` + +**Option A: Co-launch Workers and SMG (Simplest)** + +This is the easiest way to get started - SMG and workers are launched together: + +```bash +python -m sglang_router.launch_server \ + --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \ + --dp-size 4 \ + --host 0.0.0.0 \ + --port 30000 +``` + +**Option B: Separate Launch (Multi-Node)** + +For distributed deployments across multiple machines: + +1. Launch workers on each node + +```bash +# Node 1 +python -m sglang.launch_server \ + --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8000 + +# Node 2 +python -m sglang.launch_server \ + --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8000 +``` + +2. Launch SMG pointing to workers + +```bash +python -m sglang_router.launch_router \ + --worker-urls http://node1:8000 http://node2:8000 \ + --policy cache_aware \ + --host 0.0.0.0 \ + --port 30000 +``` + +**Option C: Dynamic Worker Registration** + +For elastic deployments where workers can be added/removed dynamically: + +```bash +# Launch SMG first +python -m sglang_router.launch_router \ + --policy cache_aware \ + --host 0.0.0.0 \ + --port 30000 + +# Register workers dynamically +curl -X POST http://localhost:30000/workers \ + -H "Content-Type: application/json" \ + -d '{"url": "http://worker1:8000"}' + +curl -X POST http://localhost:30000/workers \ + -H "Content-Type: application/json" \ + -d '{"url": "http://worker2:8000"}' +``` + +### Load Balancing Policies + +SMG supports multiple load balancing policies: + +| Policy | Description | Best For | +|--------|-------------|----------| +| `cache_aware` | Combines cache locality with load balancing | **Recommended for most workloads** | +| `round_robin` | Cycles through workers in order | Simple, predictable distribution | +| `random` | Random worker selection | Baseline, testing | +| `power_of_two` | Samples two workers, 
picks lighter one | Low latency requirements | + +**Cache-Aware Policy (Default, Recommended)** + +The cache-aware policy provides the best performance for most workloads: + +```bash +python -m sglang_router.launch_router \ + --worker-urls http://worker1:8000 http://worker2:8000 \ + --policy cache_aware \ + --cache-threshold 0.5 \ + --balance-abs-threshold 32 \ + --balance-rel-threshold 1.5 \ + --eviction-interval-secs 120 \ + --max-tree-size 67108864 +``` + +**How it works:** + +1. Maintains an approximate radix tree for each worker based on request history +2. Routes requests to workers with the highest prefix match (cache hit) +3. Falls back to shortest-queue routing when load is imbalanced +4. Automatically evicts old entries to prevent memory overflow + +### Best Practices + +1. **Start with `cache_aware` policy** - It provides the best balance between cache locality and load distribution for most workloads +2. **Use SMG for production** - Prefer `sglang_router.launch_server` over `sglang.launch_server` for better reliability and observability +3. **Enable health checks** - Configure `--router-health-check-interval-secs` to detect and remove unhealthy workers automatically + +**Recommended command with best practices applied:** + +```bash +python -m sglang_router.launch_server \ + --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \ + --dp-size 4 \ + --router-policy cache_aware \ + --router-health-check-interval-secs 30 \ + --router-prometheus-port 10001 \ + --host 0.0.0.0 \ + --port 30000 +``` + +For advanced configuration (circuit breakers, retries, Prometheus metrics, K8s integration), see [SGLang Model Gateway Documentation](sgl_model_gateway.md). + +### Verifying Traffic Distribution + +After launching SMG, verify that traffic is being distributed correctly: + +**1. Check worker status:** + +```bash +curl http://localhost:30000/workers +``` + +**2. Check load distribution:** + +```bash +curl http://localhost:30000/get_loads +``` + +**3. 
Monitor metrics (if Prometheus enabled):** + +```bash +# Key metrics to check +smg_router_requests_total{model="..."} +smg_worker_requests_active{worker="..."} +sglang_cache_hit_rate{source="..."} +``` + +For detailed metrics and monitoring setup, see [SGLang Model Gateway Documentation](sgl_model_gateway.md). + +## Reference + +| Strategy | Use Case | Key Benefit | +|----------|----------|-------------| +| **Native DP** (`--dp-size`) | Never | Easy to understand, not rust based | +| **SMG-Based DP** | **Production (recommended)** | Cache-aware routing, high availability | +| **DPA** (`--dp-size N --enable-dp-attention`) | DeepSeek/MLA models | Eliminates KV cache duplication, improved throughput | +| **DPA + EP** | DeepSeek MoE models | Significant throughput improvement vs vanilla TP | + +**Recommended production setup for DeepSeek:** +1. Enable **DPA** for attention layers (`--dp-size 8 --enable-dp-attention`) +2. Enable **EP** for MoE layers (`--ep 8 --moe-a2a-backend deepep`) +3. Use **SMG** with **cache_aware** policy + +**Related documentation:** +- [Expert Parallelism](expert_parallelism.md) - DeepEP, Two-Batch Overlap, EPLB +- [SGLang Model Gateway Documentation](sgl_model_gateway.md) - SMG configuration & troubleshooting +- [Large-Scale EP Blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/) - 96 GPU deployment guide diff --git a/docs/advanced_features/dp_for_multi_modal_encoder.md b/docs/advanced_features/dp_for_multi_modal_encoder.md index 62057f9581a0..a100e0688439 100644 --- a/docs/advanced_features/dp_for_multi_modal_encoder.md +++ b/docs/advanced_features/dp_for_multi_modal_encoder.md @@ -4,7 +4,7 @@ A typical VLM architecture involves two main components: an multi-modal encoder Most VLMs utilize a Vision Transformer (ViT) as their multi-modal encoder, it is responsible for processing visual data, extracting features (objects, colors, textures, etc.), and transforming them into a format that can be understood by the model. 
-The text deocoder is based on LLM. It processes textual data and generates output based on the encoded visual features. +The text decoder is based on LLM. It processes textual data and generates output based on the encoded visual features. However, since the size of ViT is very small compared to language decoders, there is relatively little gain from TP. On the other hand, TP incurs significant communication diff --git a/docs/advanced_features/epd_disaggregation.md b/docs/advanced_features/epd_disaggregation.md index 550503dfc930..c543c29bc545 100644 --- a/docs/advanced_features/epd_disaggregation.md +++ b/docs/advanced_features/epd_disaggregation.md @@ -78,3 +78,42 @@ python -m sglang_router.launch_router \ --port 8000 ``` + +#### gRPC Encoder (EPD) + +You can run the encoder as a gRPC server while keeping prefill/decode as HTTP. +When using gRPC encoders, set `SGLANG_ENCODER_MM_RECEIVER_MODE=grpc` for the +prefill process so it uses the gRPC receiver. + +```bash +# gRPC encoder +python -m sglang.launch_server \ + --model-path Qwen/Qwen3-VL-8B-Instruct \ + --encoder-only \ + --grpc-mode \ + --encoder-transfer-backend zmq_to_scheduler \ + --port 30000 + +# prefill (HTTP) - tell it to use gRPC receiver +SGLANG_ENCODER_MM_RECEIVER_MODE=grpc \ +python -m sglang.launch_server \ + --model-path Qwen/Qwen3-VL-8B-Instruct \ + --disaggregation-mode prefill \ + --language-only \ + --encoder-urls grpc://127.0.0.1:30000 \ + --encoder-transfer-backend zmq_to_scheduler \ + --port 30002 + +# decode (HTTP) +python -m sglang.launch_server \ + --model-path Qwen/Qwen3-VL-8B-Instruct \ + --disaggregation-mode decode \ + --port 30003 + +# router +python -m sglang_router.launch_router \ + --pd-disaggregation \ + --prefill http://$PREFILL_HOST:30002 \ + --decode http://$DECODE_HOST:30003 \ + --port 8000 +``` diff --git a/docs/advanced_features/hicache_best_practices.md b/docs/advanced_features/hicache_best_practices.md index 02749530ae6e..104c2b0e2d54 100644 --- 
a/docs/advanced_features/hicache_best_practices.md +++ b/docs/advanced_features/hicache_best_practices.md @@ -39,6 +39,23 @@ Notes: - `page_first`: Only compatible with `kernel` I/O backend, automatically switches to `layer_first` with `direct` backend - `page_first_direct`: Specifically designed for `direct` I/O backend with optimized memory organization +### Heterogeneous TP Support (GQA/MHA models) + +HiCache storage supports cross-cluster KV reuse when different deployments use different TP sizes (for example, `tp=4` and `tp=8`) and share the same storage backend namespace. + +Use `tp_lcm_size` in `--hicache-storage-backend-extra-config`: + +```bash +# Example: heterogeneous TP = {4, 8}, so lcm = 8 +--hicache-storage-backend-extra-config '{"tp_lcm_size": 8}' +``` + +Guidelines: + +- Set `tp_lcm_size` to the least common multiple (LCM) of all TP sizes that will share the same HiCache storage. +- For MHA models with Mooncake and `page_head` layout, HiCache will split head shards based on `tp_lcm_size` to make keys reusable across heterogeneous TP deployments. +- If all clusters use the same TP size, this option is not needed. 
+ ### Prefetch Policies ```bash diff --git a/docs/advanced_features/lora.ipynb b/docs/advanced_features/lora.ipynb index a8245f1b280c..8e6e6d0a02af 100644 --- a/docs/advanced_features/lora.ipynb +++ b/docs/advanced_features/lora.ipynb @@ -102,7 +102,7 @@ "\"\"\"\n", ")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")" + "wait_for_server(f\"http://localhost:{port}\", process=server_process)" ] }, { @@ -151,18 +151,16 @@ "metadata": {}, "outputs": [], "source": [ - "server_process, port = launch_server_cmd(\n", - " \"\"\"\n", + "server_process, port = launch_server_cmd(\"\"\"\n", "python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", " --enable-lora \\\n", " --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n", - " lora1=Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16 \\\n", + " lora1=Nutanix/Meta-Llama-3.1-8B-Instruct_SFT_lora_4_alpha_16_humaneval_raw_json \\\n", " --max-loras-per-batch 2 \\\n", " --log-level warning \\\n", - "\"\"\"\n", - ")\n", + "\"\"\")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")" + "wait_for_server(f\"http://localhost:{port}\", process=server_process)" ] }, { @@ -220,15 +218,14 @@ "metadata": {}, "outputs": [], "source": [ - "lora0 = \"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16\" # rank - 4, target modules - q_proj, k_proj, v_proj, o_proj, gate_proj\n", + "lora0 = \"Nutanix/Meta-Llama-3.1-8B-Instruct_SFT_lora_4_alpha_16_humaneval_raw_json\" # rank - 4, target modules - q_proj, k_proj, v_proj, o_proj, gate_proj\n", "lora1 = \"algoprog/fact-generation-llama-3.1-8b-instruct-lora\" # rank - 64, target modules - q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj\n", "lora0_new = \"philschmid/code-llama-3-1-8b-text-to-sql-lora\" # rank - 256, target modules - q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj\n", "\n", "\n", "# The `--target-lora-modules` param below is technically not needed, as the server will infer it from lora0 
which already has all the target modules specified.\n", "# We are adding it here just to demonstrate usage.\n", - "server_process, port = launch_server_cmd(\n", - " \"\"\"\n", + "server_process, port = launch_server_cmd(\"\"\"\n", " python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", " --enable-lora \\\n", " --cuda-graph-max-bs 2 \\\n", @@ -236,11 +233,10 @@ " --max-lora-rank 256\n", " --lora-target-modules all\n", " --log-level warning\n", - " \"\"\"\n", - ")\n", + " \"\"\")\n", "\n", "url = f\"http://127.0.0.1:{port}\"\n", - "wait_for_server(url)" + "wait_for_server(url, process=server_process)" ] }, { @@ -435,8 +431,7 @@ "metadata": {}, "outputs": [], "source": [ - "server_process, port = launch_server_cmd(\n", - " \"\"\"\n", + "server_process, port = launch_server_cmd(\"\"\"\n", " python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", " --enable-lora \\\n", " --cuda-graph-max-bs 8 \\\n", @@ -444,16 +439,15 @@ " --max-lora-rank 256 \\\n", " --lora-target-modules all \\\n", " --lora-paths \\\n", - " {\"lora_name\":\"lora0\",\"lora_path\":\"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16\",\"pinned\":true} \\\n", + " {\"lora_name\":\"lora0\",\"lora_path\":\"Nutanix/Meta-Llama-3.1-8B-Instruct_SFT_lora_4_alpha_16_humaneval_raw_json\",\"pinned\":true} \\\n", " {\"lora_name\":\"lora1\",\"lora_path\":\"algoprog/fact-generation-llama-3.1-8b-instruct-lora\"} \\\n", " lora2=philschmid/code-llama-3-1-8b-text-to-sql-lora\n", " --log-level warning\n", - " \"\"\"\n", - ")\n", + " \"\"\")\n", "\n", "\n", "url = f\"http://127.0.0.1:{port}\"\n", - "wait_for_server(url)" + "wait_for_server(url, process=server_process)" ] }, { @@ -548,16 +542,14 @@ "metadata": {}, "outputs": [], "source": [ - "server_process, port = launch_server_cmd(\n", - " \"\"\"\n", + "server_process, port = launch_server_cmd(\"\"\"\n", " python3 -m sglang.launch_server \\\n", " --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", 
" --enable-lora \\\n", " --lora-backend csgmv \\\n", " --max-loras-per-batch 16 \\\n", " --lora-paths lora1=path/to/lora1 lora2=path/to/lora2\n", - " \"\"\"\n", - ")" + " \"\"\")" ] }, { @@ -589,28 +581,26 @@ "metadata": {}, "outputs": [], "source": [ - "lora0 = \"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16\"\n", + "lora0 = \"Nutanix/Meta-Llama-3.1-8B-Instruct_SFT_lora_4_alpha_16_humaneval_raw_json\"\n", "lora1 = \"algoprog/fact-generation-llama-3.1-8b-instruct-lora\"\n", "lora2 = \"philschmid/code-llama-3-1-8b-text-to-sql-lora\"\n", "\n", "\n", - "server_process, port = launch_server_cmd(\n", - " \"\"\"\n", + "server_process, port = launch_server_cmd(\"\"\"\n", " python3 -m sglang.launch_server \\\n", " --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", " --enable-lora \\\n", " --enable-lora-overlap-loading \\\n", - " --lora-paths lora0=Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16 \\\n", + " --lora-paths lora0=Nutanix/Meta-Llama-3.1-8B-Instruct_SFT_lora_4_alpha_16_humaneval_raw_json \\\n", " lora1=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n", " lora2=philschmid/code-llama-3-1-8b-text-to-sql-lora \\\n", " --max-lora-rank 256 \\\n", " --max-loras-per-batch 2 \\\n", " --max-loaded-loras 4\n", - " \"\"\"\n", - ")\n", + " \"\"\")\n", "\n", "url = f\"http://127.0.0.1:{port}\"\n", - "wait_for_server(url)" + "wait_for_server(url, process=server_process)" ] }, { diff --git a/docs/advanced_features/pd_disaggregation.md b/docs/advanced_features/pd_disaggregation.md index b40ab11b4d01..17b81b86368e 100644 --- a/docs/advanced_features/pd_disaggregation.md +++ b/docs/advanced_features/pd_disaggregation.md @@ -130,16 +130,19 @@ PD Disaggregation with Mooncake supports the following environment variables for To enable NVLink transport for KV cache transfers with the mooncake backend (recommended for NVL72 deployments), set the following environment variables. Note that auxiliary data transfer will still use TCP as a temporary workaround. 
```bash -export SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True +export SGLANG_MOONCAKE_CUSTOM_MEM_POOL=NVLINK export MC_FORCE_MNNVL=True ``` +The `SGLANG_MOONCAKE_CUSTOM_MEM_POOL` environment variable enables the custom memory pool. Supported values are `NVLINK` (or `True`), `BAREX`, and `INTRA_NODE_NVLINK`. + #### Prefill Server Configuration | Variable | Description | Default | |:--------:|:-----------:|:--------: | **`SGLANG_DISAGGREGATION_THREAD_POOL_SIZE`** | Controls the total number of worker threads for KVCache transfer operations per TP rank | A dynamic value calculated by `int(0.75 * os.cpu_count()) // 8`, which is limited to be larger than 4 and less than 12 to ensure efficiency and prevent thread race conditions | | **`SGLANG_DISAGGREGATION_QUEUE_SIZE`** | Sets the number of parallel transfer queues. KVCache transfer requests from multiple decode instances will be sharded into these queues so that they can share the threads and the transfer bandwidth at the same time. If it is set to `1`, then we transfer requests one by one according to the FCFS (first-come, first-served) strategy | `4` | | **`SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT`** | Timeout (seconds) for receiving destination KV indices during request initialization | `300` | +| **`SGLANG_DISAGGREGATION_BOOTSTRAP_ENTRY_CLEANUP_INTERVAL`** | Interval (seconds) between cleanups of bootstrap entries | `120` | If a greater mean TTFT is acceptable, you can `export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600` (10 minutes) to relax the timeout condition. Please be aware that this setting will cause prefill instances to take a longer time to clean up the affected memory resources when a running decode node loses connection.
diff --git a/docs/advanced_features/piecewise_cuda_graph.md b/docs/advanced_features/piecewise_cuda_graph.md new file mode 100644 index 000000000000..e0bb47af94eb --- /dev/null +++ b/docs/advanced_features/piecewise_cuda_graph.md @@ -0,0 +1,189 @@ +# Piecewise CUDA Graph + +## Motivation + +Standard CUDA graphs capture the entire model forward pass as a single graph. This works well for decode (fixed batch size), but not for extend/prefill where the number of tokens varies across iterations. + +Piecewise CUDA Graph (PCG) solves this by splitting the model's computation graph into pieces (roughly one per layer) at "split points" (e.g., MoE dispatch ops). Each piece is captured as a separate CUDA graph for a set of pre-defined token lengths. At runtime, the input is padded to the nearest captured size, and each piece is replayed. This eliminates kernel launch overhead for prefill/extend while still supporting dynamic shapes. + +Recently we **enabled PCG by default**, which means that the old `--enable-piecewise-cuda-graph` flag is deprecated. Use `--disable-piecewise-cuda-graph` to turn it off. + +## Usage + +PCG is enabled by default for supported configurations. No extra flags needed: + +```bash +python3 -m sglang.launch_server \ + --model-path meta-llama/Llama-3.1-8B-Instruct +``` + +### Disable PCG + +```bash +python3 -m sglang.launch_server \ + --model-path meta-llama/Llama-3.1-8B-Instruct \ + --disable-piecewise-cuda-graph +``` + +### Custom capture sizes + +```bash +python3 -m sglang.launch_server \ + --model-path meta-llama/Llama-3.1-8B-Instruct \ + --piecewise-cuda-graph-max-tokens 2048 +``` + +### Server Args + +| Argument | Default | Description | +|---|---|---| +| `--disable-piecewise-cuda-graph` | `False` | Disable PCG for extend/prefill. | +| `--enforce-piecewise-cuda-graph` | `False` | Force-enable PCG, skipping all auto-disable conditions. For testing only. | +| `--piecewise-cuda-graph-max-tokens` | `None` (auto) | Maximum token count to capture. 
Defaults to `chunked_prefill_size` (non-MLA) or `2048` (MLA). | +| `--piecewise-cuda-graph-tokens` | `None` (auto) | Explicit list of token lengths to capture. Auto-generated if not set. | +| `--piecewise-cuda-graph-compiler` | `"eager"` | Compiler backend for the captured subgraphs. Choices: `eager`, `inductor`. | +| ~~`--enable-piecewise-cuda-graph`~~ | — | **Deprecated.** PCG is now enabled by default. Use `--enforce-piecewise-cuda-graph` to skip auto-disable conditions. | + +## Bug Report + +PCG is enabled by default but is still in an experimental stage. Since PCG relies on `torch.compile` to trace the model's forward pass, most bugs are introduced by torch compile tracing failures (e.g., untraceable ops, dynamic control flow, or graph breaks). If you encounter any issues related to PCG, please disable it by adding `--disable-piecewise-cuda-graph` to your launch command and report the bug at [GitHub Issues](https://github.com/sgl-project/sglang/issues/new/choose). We greatly appreciate your help in improving this feature. + +### For Users + +If you see an error message like the following during server startup, it is a PCG bug: + +``` +Piecewise CUDA Graph is enabled by default as an experimental feature. +To work around this error, add --disable-piecewise-cuda-graph to your launch command. +Please report this issue at https://github.com/sgl-project/sglang/issues/new/choose +``` + +To work around it, add `--disable-piecewise-cuda-graph` to your launch command. When filing a bug report, please include: +1. The full error traceback +2. Model name and quantization method +3. Launch command with all arguments +4. GPU type and driver version + +### For Developers + +Since PCG relies on `torch.compile` to trace the model's forward pass, newly developed CUDA kernels (both JIT kernels and sgl-kernels) are typically not compatible with `torch.compile` out of the box. 
The tracing will fail on untraceable operations such as JIT compilation, file I/O, or dynamic module loading inside the kernel. + +To make a kernel compatible with PCG, you need to register it as a custom op using `register_custom_op` from `sglang.srt.utils.custom_op`. This wraps the kernel as an opaque node in the compiled graph so that `torch.compile` will not trace inside it. + +**Example usage (JIT kernel):** + +```python +from sglang.srt.utils.custom_op import register_custom_op + +# Inplace operator (no return value) +@register_custom_op(mutates_args=["output_q", "output_s"]) +def per_token_group_quant_8bit( + input: torch.Tensor, + output_q: torch.Tensor, + output_s: torch.Tensor, +) -> None: + # kernel implementation ... +``` + +**Example usage (operator with output):** + +```python +# out_shape indicates which argument has the same shape as the output +@register_custom_op(mutates_args=["x"], out_shape=0) +def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + return x.add_(y) +``` + +For wrapping external library functions (e.g., FlashInfer kernels), use `register_custom_op_from_extern` instead. See `python/sglang/srt/utils/custom_op.py` for full API documentation. + +## How it works +### Torch compile backend + +PCG uses `torch.compile` with a custom backend (`SGLangBackend`) to split and compile the model's forward pass. The flow is: + +``` +model.forward wrapper +→ torch.compile(..., backend=SGLangBackend) +→ FX graph +→ split_graph() at registered split ops +→ split_gm (top-level graph that chains the pieces) +→ replace capturable submodules with CUDAPiecewiseBackend +→ runtime dispatch: eager split ops + per-piece capture/replay +``` + +- **Install**: `install_torch_compiled()` replaces `model.forward` with a wrapper function. When `is_in_piecewise_cuda_graph()` returns True, the wrapper dispatches to the compiled callable; otherwise it falls back to the original forward. 
The first invocation through this path triggers Dynamo tracing and graph compilation — CUDA graph replay only happens after the capture phase completes. + +- **Split**: When `torch.compile` traces the model, `SGLangBackend` receives the FX graph and calls `split_graph()`. Ops listed in `CompilationConfig.split_ops` are treated as split points, so the graph is cut at each one. These split-op submodules are left to run eagerly at runtime, while the surrounding submodules are compiled and wrapped by `CUDAPiecewiseBackend`. The result is a top-level "stitching graph" (`split_gm`) with children such as `submod_0`, `submod_1`, … interleaving capturable subgraphs and eager split-op submodules. + +- **Replace**: `PiecewiseCompileInterpreter` iterates over each capturable submodule in `split_gm`, compiles it for general (dynamic) shapes, and replaces it in-place with a `CUDAPiecewiseBackend` instance. Split-op submodules (e.g., attention, all-reduce) are left as-is and run eagerly at runtime. + +- **Dispatch**: At runtime, calling `split_gm` executes the stitching graph, which calls each submodule in order. Split-op submodules run eagerly. Each `CUDAPiecewiseBackend` submodule goes through three phases: + - **Compile warmup** — runs the general-shape compiled path. + - **Capture** — for each capture size, runs one warmup pass then records a CUDA graph. + - **Steady-state replay** — replays the captured CUDA graph for each forward pass. + +### Piecewise cuda graph runner + +`PiecewiseCudaGraphRunner` orchestrates the full lifecycle through three phases: + +- **Compile** — Warms up JIT kernels with a dummy forward pass, then wraps the model with `torch.compile`, triggering Dynamo tracing to split the FX graph and create `CUDAPiecewiseBackend` instances for each subgraph piece. + +- **Capture** — Iterates over capture sizes in reverse order (largest first). For each size, runs the forward pass twice (one warmup, one CUDA graph capture). 
+ +- **Replay** — At runtime, finds the smallest captured size >= actual token count via binary search, copies inputs into static buffers with zero-padding, replays the captured CUDA graphs, and slices outputs back to the actual token count. + +### Memory optimization + +The memory cost of PCG comes from two parts: **torch memory allocator** and **non-torch memory**. + +The torch memory allocator overhead is trivial thanks to several optimizations: a global shared memory pool is reused across all CUDA graph runners and capture sizes, capture is done in reverse order (large to small) so smaller graphs reuse memory allocated by larger ones, and output tensors of the last subgraph are stored as weak references to maximize memory reuse. + +The main memory overhead comes from non-torch memory — the CUDA graph objects themselves require GPU memory to store the recorded kernel launch parameters and internal state. This overhead scales with the number of captured sizes, which is why `piecewise_cuda_graph_max_tokens` is capped conservatively by default. + +### Shape configuration +Piecewise CUDA graph pre-captures graphs for a set of token counts. At runtime, the actual token count is rounded up to the nearest captured size (via binary search), and the corresponding graph is replayed. If the token count exceeds the largest captured size, the runtime falls back to the normal (non-graph) forward path. + +The default capture schedule is auto-generated with increasing granularity: + +| Token range | Step size | +|-------------|-----------| +| 4 – 32 | 4 | +| 48 – 256 | 16 | +| 288 – 512 | 32 | +| 576 – 1024 | 64 | +| 1280 – 4096 | 256 | +| 4096+ | 512 | + +For the auto-generated schedule, sizes are capped at `--piecewise-cuda-graph-max-tokens`. The default cap is `chunked_prefill_size` for non-MLA models and `2048` for MLA backend models. If `--max-total-tokens` is set, the cap is further limited to not exceed it. 
Additionally, Llama-2 models are auto-capped at 4096 tokens as a temporary workaround. + +## Compatibility + +PCG is auto-disabled in the following scenarios. We are actively working on expanding compatibility — support for many of these will be coming soon. + +- Disabled model architectures (e.g., `DeepseekV32ForCausalLM`) +- Speculative decoding +- DP attention +- Pipeline parallelism (`pp_size > 1`) +- Non-CUDA hardware (AMD ROCm, Ascend NPU) +- MoE A2A backend +- LoRA +- Multimodal / VLM models +- DLLM (diffusion LLM) +- Deterministic inference +- PD disaggregation +- Expert distribution recorder / EPLB + +Use `--enforce-piecewise-cuda-graph` to skip all auto-disable checks (for testing/debugging only). + +## Code Reference + +| File | Description | +|---|---| +| `python/sglang/srt/model_executor/piecewise_cuda_graph_runner.py` | Main runner: init, capture, replay | +| `python/sglang/srt/compilation/compile.py` | `install_torch_compiled` trampoline | +| `python/sglang/srt/compilation/backend.py` | `SGLangBackend`, graph splitting, piecewise compilation | +| `python/sglang/srt/compilation/cuda_piecewise_backend.py` | Per-subgraph CUDA graph capture/replay | +| `python/sglang/srt/compilation/piecewise_context_manager.py` | Global context flags and `ForwardContext` | +| `python/sglang/srt/compilation/compilation_config.py` | Capture sizes, split ops, compiler config | +| `python/sglang/srt/utils/custom_op.py` | `register_custom_op` for torch.compile compatibility | +| `python/sglang/srt/server_args.py` | Server arguments and auto-disable logic | diff --git a/docs/advanced_features/quantization.md b/docs/advanced_features/quantization.md index 90715a908ea7..ce14bbaf2bbc 100644 --- a/docs/advanced_features/quantization.md +++ b/docs/advanced_features/quantization.md @@ -17,6 +17,34 @@ or [NeuralMagic](https://huggingface.co/collections/neuralmagic) collections on popular quality validated quantized models. 
Quantized models must be validated via benchmarks post-quantization to guard against abnormal quantization loss regressions. +## Platform Compatibility + +The following table summarizes quantization method support across NVIDIA and AMD GPUs. + +| Method | NVIDIA GPUs | AMD GPUs (MI300X/MI325X/MI350X) | Notes | +|--------|:-----------:|:-------------------------------:|-------| +| `fp8` | Yes | Yes | Aiter or Triton backend on AMD | +| `mxfp4` | Yes | Yes | Requires CDNA3/CDNA4 with MXFP support; uses Aiter | +| `blockwise_int8` | Yes | Yes | Triton-based, works on both platforms | +| `w8a8_int8` | Yes | Yes | | +| `w8a8_fp8` | Yes | Yes | Aiter or Triton FP8 on AMD | +| `awq` | Yes | Yes | Uses Triton dequantize on AMD (vs. optimized CUDA kernels on NVIDIA) | +| `gptq` | Yes | Yes | Uses Triton or vLLM kernels on AMD | +| `compressed-tensors` | Yes | Yes | Aiter paths for FP8/MoE on AMD | +| `quark` | Yes | Yes | AMD Quark quantization; Aiter GEMM paths on AMD | +| `auto-round` | Yes | Yes | Platform-agnostic (Intel auto-round) | +| `quark_int4fp8_moe` | No | Yes | AMD-only; online INT4-to-FP8 MoE quantization (CDNA3/CDNA4) | +| `awq_marlin` | Yes | No | Marlin kernels are CUDA-only | +| `gptq_marlin` | Yes | No | Marlin kernels are CUDA-only | +| `gguf` | Yes | No | CUDA-only kernels in sgl-kernel | +| `modelopt` / `modelopt_fp8` | Yes | No | NVIDIA ModelOpt, requires NVIDIA hardware | +| `modelopt_fp4` | Yes (Blackwell) | No | NVIDIA Blackwell only | +| `petit_nvfp4` | Yes (Blackwell) | No | NVIDIA NvFP4, Blackwell only | +| `bitsandbytes` | Yes | Experimental | Depends on bitsandbytes ROCm support | +| `torchao` (`int4wo`, etc.) | Yes | Partial | `int4wo` not supported on AMD; other methods may work | + +On AMD, several of these methods use [Aiter](https://github.com/ROCm/aiter) for acceleration -- set `SGLANG_USE_AITER=1` where noted. See [AMD GPU setup](../platforms/amd_gpu.md) for installation and configuration details. 
+ ## Offline Quantization To load already quantized models, simply load the model weights and config. **Again, if the model has been quantized offline, @@ -191,18 +219,81 @@ python3 -m sglang.launch_server \ #### Using [NVIDIA ModelOpt](https://github.com/NVIDIA/Model-Optimizer) -NVIDIA Model Optimizer (ModelOpt) provides advanced quantization techniques optimized for NVIDIA hardware. SGLang includes a streamlined workflow for quantizing models with ModelOpt and automatically exporting them for deployment. +NVIDIA Model Optimizer (ModelOpt) provides advanced quantization techniques optimized for NVIDIA hardware. + +**Offline vs. Online Quantization:** + +SGLang supports two modes for ModelOpt. + +* **Offline Quantization (pre-quantized):** + * **Usage:** Download a pre-quantized model from Hugging Face or run `hf_ptq.py` once to create a new quantized checkpoint. Then load this quantized checkpoint. + * **Pros:** Fast server startup, quantization can be validated before deployment, efficient resource usage. + * **Cons:** Requires an extra preparation step. + +* **Online Quantization (quant and serve):** + * **Usage:** Load a standard BF16/FP16 model and add a flag. The engine applies quantization *on startup*. + * **Pros:** Convenient (no new checkpoint needed). + * **Cons:** **High startup time**, increases VRAM usage during initialization (risk of OOM). + +The following sections guide you through using the Offline path: loading pre-quantized models or creating your own checkpoints. + +##### Using Pre-Quantized Checkpoints + +If a model is already quantized (e.g., from Hugging Face), you can load it directly. + +* **FP8 Models:** + Use `--quantization modelopt_fp8`. + ```bash + python3 -m sglang.launch_server \ + --model-path nvidia/Llama-3.1-8B-Instruct-FP8 \ + --quantization modelopt_fp8 \ + --port 30000 + ``` + +* **FP4 Models:** + Use `--quantization modelopt_fp4`. 
+ ```bash + python3 -m sglang.launch_server \ + --model-path nvidia/Llama-3.3-70B-Instruct-NVFP4 \ + --quantization modelopt_fp4 \ + --port 30000 + ``` + +##### Creating Your Own Quantized Checkpoints + +If a pre-quantized checkpoint is not available for your model, you can create one using NVIDIA Model Optimizer's `hf_ptq.py` script. + +**Why quantize?** +- Reduce VRAM usage +- Higher throughput and lower latency +- More flexible deployment (on smaller GPUs) + +**What can be quantized?** +- The entire model +- MLP layers only +- KV cache + +**Key options in `hf_ptq.py`:** + +`--qformat`: Quantization formats `fp8`, `nvfp4`, `nvfp4_mlp_only` + +`--kv_cache_qformat`: KV cache quantization format (default: `fp8`) + +**Note:** The default `kv_cache_qformat` may not be optimal for all use cases. Consider setting this explicitly. + +**Hardware requirements:** Hopper and higher are recommended. Insufficient GPU memory may cause weight offloading, resulting in extremely long quantization time. + +For detailed usage and supported model architectures, see [NVIDIA Model Optimizer LLM PTQ](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_ptq). + +SGLang includes a streamlined workflow for quantizing models with ModelOpt and automatically exporting them for deployment. + ##### Installation -First, install ModelOpt. 
You can either install it directly or as an optional SGLang dependency: +First, install ModelOpt: ```bash -# Option 1: Install ModelOpt directly pip install nvidia-modelopt - -# Option 2: Install SGLang with ModelOpt support (recommended) -pip install sglang[modelopt] ``` ##### Quantization and Export Workflow @@ -277,20 +368,33 @@ Or using the Python API: ```python import sglang as sgl -# Deploy exported ModelOpt quantized model -llm = sgl.Engine( - model_path="./quantized_tinyllama_fp8", - quantization="modelopt" -) - -# Run inference -prompts = ["Hello, how are you?", "What is the capital of France?"] -sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 100} -outputs = llm.generate(prompts, sampling_params) +def main(): + # Deploy exported ModelOpt quantized model + llm = sgl.Engine( + model_path="./quantized_tinyllama_fp8", + quantization="modelopt", + ) + + # Run inference + prompts = [ + "Hello, how are you?", + "What is the capital of France?", + ] + sampling_params = { + "temperature": 0.8, + "top_p": 0.95, + "max_new_tokens": 100, + } + + outputs = llm.generate(prompts, sampling_params) + + for i, output in enumerate(outputs): + print(f"Prompt: {prompts[i]}") + print(f"Output: {output['text']}") + +if __name__ == "__main__": + main() -for i, output in enumerate(outputs): - print(f"Prompt: {prompts[i]}") - print(f"Output: {output.outputs[0].text}") ``` ##### Advanced Features diff --git a/docs/advanced_features/separate_reasoning.ipynb b/docs/advanced_features/separate_reasoning.ipynb index fde97d8a6a2c..6277dd8bd4bc 100644 --- a/docs/advanced_features/separate_reasoning.ipynb +++ b/docs/advanced_features/separate_reasoning.ipynb @@ -70,7 +70,7 @@ " \"python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 --log-level warning\"\n", ")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")" + "wait_for_server(f\"http://localhost:{port}\", 
process=server_process)" ] }, { diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md index 0f1a327c9f11..b8d89c208e07 100644 --- a/docs/advanced_features/server_arguments.md +++ b/docs/advanced_features/server_arguments.md @@ -153,6 +153,8 @@ Please consult the documentation below and [server_args.py](https://github.com/s | `--device` | The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified. | `None` | Type: str | | `--tensor-parallel-size`
`--tp-size` | The tensor parallelism size. | `1` | Type: int | | `--pipeline-parallel-size`
`--pp-size` | The pipeline parallelism size. | `1` | Type: int | +| `--attention-context-parallel-size`
`--attn-cp-size` | The attention context parallelism size. | `1` | Type: int | +| `--moe-data-parallel-size`<br>
`--moe-dp-size`| The moe data parallelism size. | `1` | Type: int| | `--pp-max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | `None` | Type: int | | `--pp-async-batch-depth` | The async batch depth of pipeline parallelism. | `0` | Type: int | | `--stream-interval` | The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher | `1` | Type: int | @@ -212,9 +214,9 @@ Please consult the documentation below and [server_args.py](https://github.com/s | `--admin-api-key` | Set **admin API key** for administrative/control endpoints (e.g., weights update, cache flush, `/get_server_info`). Endpoints marked as admin-only require `Authorization: Bearer ` when this is set. | `None` | Type: str | | `--served-model-name` | Override the model name returned by the v1/models endpoint in OpenAI API server. | `None` | Type: str | | `--weight-version` | Version identifier for the model weights. Defaults to 'default' if not specified. | `default` | Type: str | -| `--chat-template` | The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server. | `None` | Type: str | +| `--chat-template` | The builtin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server. | `None` | Type: str | | `--hf-chat-template-name` | When the HuggingFace tokenizer has multiple chat templates (e.g., 'default', 'tool_use', 'rag'), specify which named template to use. If not set, the first available template is used. | `None` | Type: str | -| `--completion-template` | The buliltin completion template name or the path of the completion template file. This is only used for OpenAI-compatible API server. only for code completion currently. | `None` | Type: str | +| `--completion-template` | The builtin completion template name or the path of the completion template file. 
This is only used for OpenAI-compatible API server. only for code completion currently. | `None` | Type: str | | `--file-storage-path` | The path of the file storage in backend. | `sglang_storage` | Type: str | | `--enable-cache-report` | Return number of cached tokens in usage.prompt_tokens_details for each openai request. | `False` | bool flag (set to enable) | | `--reasoning-parser` | Specify the parser for reasoning models. Supported parsers: [deepseek-r1, deepseek-v3, glm45, gpt-oss, kimi, qwen3, qwen3-thinking, step3]. | `None` | `deepseek-r1`, `deepseek-v3`, `glm45`, `gpt-oss`, `kimi`, `qwen3`, `qwen3-thinking`, `step3` | @@ -264,10 +266,10 @@ Please consult the documentation below and [server_args.py](https://github.com/s | `--sampling-backend` | Choose the kernels for sampling layers. | `None` | `flashinfer`, `pytorch`, `ascend` | | `--grammar-backend` | Choose the backend for grammar-guided decoding. | `None` | `xgrammar`, `outlines`, `llguidance`, `none` | | `--mm-attention-backend` | Set multimodal attention backend. | `None` | `sdpa`, `fa3`, `fa4`, `triton_attn`, `ascend_attn`, `aiter_attn` | -| `--nsa-prefill-backend` | Choose the NSA backend for the prefill stage (overrides `--attention-backend` when running DeepSeek NSA-style attention). | `flashmla_sparse` | `flashmla_sparse`, `flashmla_kv`, `flashmla_auto`, `fa3`, `tilelang`, `aiter` | -| `--nsa-decode-backend` | Choose the NSA backend for the decode stage when running DeepSeek NSA-style attention. Overrides `--attention-backend` for decoding. | `fa3` | `flashmla_sparse`, `flashmla_kv`, `fa3`, `tilelang`, `aiter` | -| `--fp8-gemm-backend` | Choose the runner backend for Blockwise FP8 GEMM operations. 
Options: 'auto' (default, auto-selects based on hardware), 'deep_gemm' (JIT-compiled; enabled by default on NVIDIA Hopper (SM90) and Blackwell (SM100) when DeepGEMM is installed), 'flashinfer_trtllm' (optimal for Blackwell and low-latency), 'cutlass' (optimal for Hopper/Blackwell GPUs and high-throughput), 'triton' (fallback, widely compatible), 'aiter' (ROCm only). **NOTE**: This replaces the deprecated environment variables SGLANG_ENABLE_FLASHINFER_FP8_GEMM and SGLANG_SUPPORT_CUTLASS_BLOCK_FP8. | `auto` | `auto`, `deep_gemm`, `flashinfer_trtllm`, `cutlass`, `triton`, `aiter` | -| `--fp4-gemm-backend` | Choose the runner backend for NVFP4 GEMM operations. Options: 'auto' (default, auto-selects between flashinfer_cudnn/flashinfer_cutlass based on CUDA/cuDNN version), 'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), 'flashinfer_cutlass' (FlashInfer CUTLASS backend, optimal on CUDA 12), 'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). All backends are from FlashInfer; when FlashInfer is unavailable, sgl-kernel CUTLASS is used as an automatic fallback. **NOTE**: This replaces the deprecated environment variable SGLANG_FLASHINFER_FP4_GEMM_BACKEND. | `auto` | `auto`, `flashinfer_cudnn`, `flashinfer_cutlass`, `flashinfer_trtllm` | +| `--nsa-prefill-backend` | Choose the NSA backend for the prefill stage (overrides `--attention-backend` when running DeepSeek NSA-style attention). | `flashmla_sparse` | `flashmla_sparse`, `flashmla_kv`, `flashmla_auto`, `fa3`, `tilelang`, `aiter`, `trtllm` | +| `--nsa-decode-backend` | Choose the NSA backend for the decode stage when running DeepSeek NSA-style attention. Overrides `--attention-backend` for decoding. | `fa3` | `flashmla_sparse`, `flashmla_kv`, `fa3`, `tilelang`, `aiter`, `trtllm` | +| `--fp8-gemm-backend` | Choose the runner backend for Blockwise FP8 GEMM operations. 
Options: 'auto' (default, auto-selects based on hardware), 'deep_gemm' (JIT-compiled; enabled by default on NVIDIA Hopper (SM90) and Blackwell (SM100) when DeepGEMM is installed), 'flashinfer_trtllm' (FlashInfer TRTLLM backend; SM100/SM103 only), 'flashinfer_cutlass' (FlashInfer CUTLASS backend, SM120 only), 'flashinfer_deepgemm' (Hopper SM90 only, uses swapAB optimization for small M dimensions in decoding), 'cutlass' (optimal for Hopper/Blackwell GPUs and high-throughput), 'triton' (fallback, widely compatible), 'aiter' (ROCm only). **NOTE**: This replaces the deprecated environment variables SGLANG_ENABLE_FLASHINFER_FP8_GEMM and SGLANG_SUPPORT_CUTLASS_BLOCK_FP8. | `auto` | `auto`, `deep_gemm`, `flashinfer_trtllm`, `flashinfer_cutlass`, `flashinfer_deepgemm`, `cutlass`, `triton`, `aiter` | +| `--fp4-gemm-backend` | Choose the runner backend for NVFP4 GEMM operations. Options: 'flashinfer_cutlass' (default), 'auto' (auto-selects between flashinfer_cudnn/flashinfer_cutlass based on CUDA/cuDNN version), 'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), 'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). All backends are from FlashInfer; when FlashInfer is unavailable, sgl-kernel CUTLASS is used as an automatic fallback. **NOTE**: This replaces the deprecated environment variable SGLANG_FLASHINFER_FP4_GEMM_BACKEND. | `flashinfer_cutlass` | `auto`, `flashinfer_cudnn`, `flashinfer_cutlass`, `flashinfer_trtllm` | | `--disable-flashinfer-autotune` | Flashinfer autotune is enabled by default. Set this flag to disable the autotune. | `False` | bool flag (set to enable) | ## Speculative decoding @@ -309,10 +311,11 @@ Please consult the documentation below and [server_args.py](https://github.com/s | Argument | Description | Defaults | Options | | --- | --- | --- | --- | | `--expert-parallel-size`
`--ep-size`
`--ep` | The expert parallelism size. | `1` | Type: int | -| `--moe-a2a-backend` | Select the backend for all-to-all communication for expert parallelism. | `none` | `none`, `deepep`, `mooncake`, `ascend_fuseep`| +| `--moe-a2a-backend` | Select the backend for all-to-all communication for expert parallelism. | `none` | `none`, `deepep`, `mooncake`, `mori`, `ascend_fuseep`| | `--moe-runner-backend` | Choose the runner backend for MoE. | `auto` | `auto`, `deep_gemm`, `triton`, `triton_kernel`, `flashinfer_trtllm`, `flashinfer_cutlass`, `flashinfer_mxfp4`, `flashinfer_cutedsl`, `cutlass` | | `--flashinfer-mxfp4-moe-precision` | Choose the computation precision of flashinfer mxfp4 moe | `default` | `default`, `bf16` | | `--enable-flashinfer-allreduce-fusion` | Enable FlashInfer allreduce fusion with Residual RMSNorm. | `False` | bool flag (set to enable) | +| `--enable-aiter-allreduce-fusion` | Enable aiter allreduce fusion with Residual RMSNorm. | `False` | bool flag (set to enable) | | `--deepep-mode` | Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch. | `auto` | `normal`, `low_latency`, `auto` | | `--ep-num-redundant-experts` | Allocate this number of redundant experts in expert parallel. | `0` | Type: int | | `--ep-dispatch-algorithm` | The algorithm to choose ranks for redundant experts in expert parallel. | `None` | Type: str | @@ -328,13 +331,14 @@ Please consult the documentation below and [server_args.py](https://github.com/s | `--deepep-config` | Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path. | `None` | Type: str | | `--moe-dense-tp-size` | TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports. 
| `None` | Type: int | | `--elastic-ep-backend` | Specify the collective communication backend for elastic EP. Currently supports 'mooncake'. | `none` | `none`, `mooncake` | +| `--enable-elastic-expert-backup` | Enable the elastic EP backend to back up expert weights in DRAM. Currently supports 'mooncake'. | `False` | bool flag (set to enable) | | `--mooncake-ib-device` | The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices (e.g., --mooncake-ib-device mlx5_0,mlx5_1). Default is None, which triggers automatic device detection when Mooncake Backend is enabled. | `None` | Type: str | ## Mamba Cache | Argument | Description | Defaults | Options | | --- | --- | --- | --- | | `--max-mamba-cache-size` | The maximum size of the mamba cache. | `None` | Type: int | -| `--mamba-ssm-dtype` | The data type of the SSM states in mamba cache. | `float32` | `float32`, `bfloat16` | +| `--mamba-ssm-dtype` | The data type of the SSM states in mamba cache. | `float32` | `float32`, `bfloat16`, `float16` | | `--mamba-full-memory-ratio` | The ratio of mamba state memory to full kv cache memory. | `0.9` | Type: float | | `--mamba-scheduler-strategy` | The strategy to use for mamba scheduler. `auto` currently defaults to `no_buffer`. 1. `no_buffer` does not support overlap scheduler due to not allocating extra mamba state buffers. Branching point caching support is feasible but not implemented. 2. `extra_buffer` supports overlap schedule by allocating extra mamba state buffers to track mamba state for caching (mamba state usage per running req becomes `2x` for non-spec; `1+(1/(2+speculative_num_draft_tokens))x` for spec dec (e.g. 1.16x if speculative_num_draft_tokens==4)). 2a. `extra_buffer` is strictly better for non-KV-cache-bound cases; for KV-cache-bound cases, the tradeoff depends on whether enabling overlap outweighs reduced max running requests. 2b. 
mamba caching at radix cache branching point is strictly better than non-branch but requires kernel support (currently only FLA backend), currently only extra_buffer supports branching. | `auto` | `auto`, `no_buffer`, `extra_buffer` | | `--mamba-track-interval` | The interval (in tokens) to track the mamba state during decode. Only used when `--mamba-scheduler-strategy` is `extra_buffer`. Must be divisible by page_size if set, and must be >= speculative_num_draft_tokens when using speculative decoding. | `256` | Type: int | @@ -373,6 +377,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s | `--kt-max-deferred-experts-per-token` | [ktransformers parameter] Maximum number of experts deferred to CPU per token. All MoE layers except the final one use this value; the final layer always uses 0. | `None` | Type: int | ## Diffusion LLM + | Argument | Description | Defaults | Options | | --- | --- | --- | --- | | `--dllm-algorithm` | The diffusion LLM algorithm, such as LowConfidence. | `None` | Type: str | @@ -431,7 +436,8 @@ Please consult the documentation below and [server_args.py](https://github.com/s | `--tbo-token-distribution-threshold` | The threshold of token distribution between two batches in micro-batch-overlap, determines whether to two-batch-overlap or two-chunk-overlap. Set to 0 denote disable two-chunk-overlap. | `0.48` | Type: float | | `--enable-torch-compile` | Optimize the model with torch.compile. Experimental feature. | `False` | bool flag (set to enable) | | `--enable-torch-compile-debug-mode` | Enable debug mode for torch compile. | `False` | bool flag (set to enable) | -| `--enable-piecewise-cuda-graph` | Optimize the model with piecewise cuda graph for extend/prefill only. Experimental feature. | `False` | bool flag (set to enable) | +| `--disable-piecewise-cuda-graph` | Disable piecewise cuda graph for extend/prefill. PCG is enabled by default. 
| `False` | bool flag (set to disable) | +| `--enforce-piecewise-cuda-graph` | Enforce piecewise cuda graph, skipping all auto-disable conditions. For testing only. | `False` | bool flag (set to enable) | | `--piecewise-cuda-graph-tokens` | Set the list of tokens when using piecewise cuda graph. | `None` | Type: JSON list | | `--piecewise-cuda-graph-compiler` | Set the compiler for piecewise cuda graph. Choices are: eager, inductor. | `eager` | `eager`, `inductor` | | `--torch-compile-max-bs` | Set the maximum batch size when using torch compile. | `32` | Type: int | @@ -462,7 +468,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s | `--rl-on-policy-target` | The training system that SGLang needs to match for true on-policy. | `None` | `fsdp` | | `--enable-attn-tp-input-scattered` | Allow input of attention to be scattered when only using tensor parallelism, to reduce the computational load of operations such as qkv latent. | `False` | bool flag (set to enable) | | `--enable-nsa-prefill-context-parallel` | Enable context parallelism used in the long sequence prefill phase of DeepSeek v3.2. | `False` | bool flag (set to enable) | -| `--nsa-prefill-cp-mode` | Token splitting mode for the prefill phase of DeepSeek v3.2 under context parallelism. Optional values: `in-seq-split` (default), `round-robin-split`. `round-robin-split` distributes tokens across ranks based on `token_idx % cp_size`. It supports multi-batch prefill, fused MoE, and FP8 KV cache. | `in-seq-split` | `in-seq-split`, `round-robin-split` | +| `--nsa-prefill-cp-mode` | Token splitting mode for the prefill phase of DeepSeek v3.2 under context parallelism. Optional values: `round-robin-split` (default), `in-seq-split`. `round-robin-split` distributes tokens across ranks based on `token_idx % cp_size`. It supports multi-batch prefill, fused MoE, and FP8 KV cache. 
| `round-robin-split` | `in-seq-split`, `round-robin-split` | | `--enable-fused-qk-norm-rope` | Enable fused qk normalization and rope rotary embedding. | `False` | bool flag (set to enable) | | `--enable-precise-embedding-interpolation` | Enable corner alignment for resize of embeddings grid to ensure more accurate(but slower) evaluation of interpolated embedding values. | `False` | bool flag (set to enable) | @@ -487,12 +493,8 @@ Please consult the documentation below and [server_args.py](https://github.com/s | `--disaggregation-mode` | Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated | `null` | `null`, `prefill`, `decode` | | `--disaggregation-transfer-backend` | The backend for disaggregation transfer. Default is mooncake. | `mooncake` | `mooncake`, `nixl`, `ascend`, `fake` | | `--disaggregation-bootstrap-port` | Bootstrap server port on the prefill server. Default is 8998. | `8998` | Type: int | -| `--disaggregation-decode-tp` | Decode tp size. If not set, it matches the tp size of the current engine. This is only set on the prefill server. | `None` | Type: int | -| `--disaggregation-decode-dp` | Decode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server. | `None` | Type: int | -| `--disaggregation-prefill-pp` | Prefill pp size. If not set, it is default to 1. This is only set on the decode server. | `1` | Type: int | | `--disaggregation-ib-device` | The InfiniBand devices for disaggregation transfer, accepts single device (e.g., --disaggregation-ib-device mlx5_0) or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). Default is None, which triggers automatic device detection when mooncake backend is enabled. | `None` | Type: str | | `--disaggregation-decode-enable-offload-kvcache` | Enable async KV cache offloading on decode server (PD mode). 
| `False` | bool flag (set to enable) | -| `--disaggregation-decode-enable-fake-auto` | Auto enable FAKE mode for decode node testing, no need to pass bootstrap_host and bootstrap_room in request. | `False` | bool flag (set to enable) | | `--num-reserved-decode-tokens` | Number of decode tokens that will have memory reserved when adding new request to the running batch. | `512` | Type: int | | `--disaggregation-decode-polling-interval` | The interval to poll requests in decode server. Can be set to >1 to reduce the overhead of this. | `1` | Type: int | diff --git a/docs/advanced_features/sglang_for_rl.md b/docs/advanced_features/sglang_for_rl.md index 2fd84c90de69..12eb41540339 100644 --- a/docs/advanced_features/sglang_for_rl.md +++ b/docs/advanced_features/sglang_for_rl.md @@ -106,6 +106,29 @@ This path trades some I/O overhead for simplicity and flexibility. It integrates **Python Engine API:** `engine.update_weights_from_disk(model_path, load_format=None)` +**Diffusion engine (SGLang-Diffusion):** The diffusion engine exposes the same `POST /update_weights_from_disk` endpoint with the following behavior: + +- **All-or-nothing with rollback:** if any module fails to load, all previously updated modules are rolled back to the original weights by reloading from the original model path. No partial updates are left behind. If rollback itself fails, the exception propagates so the caller knows the model is in an inconsistent state. +- **Offload-aware:** when layerwise offload (`--dit-layerwise-offload`) is enabled, the diffusion offload manager replaces GPU parameters with small `torch.empty((1,))` placeholders while real weights live in consolidated pinned CPU buffers. A naive `param.data.copy_()` would fail with a shape mismatch. Instead, the updater dynamically detects active offload managers and writes new weights directly into their CPU buffers, bypassing the placeholders entirely. 
For any layer that happens to be prefetched on GPU at update time, the live GPU tensor is also updated so the change takes effect immediately. This requires no extra GPU memory and does not disturb the offload state. +- **DTensor-aware:** parameters distributed via `torch.distributed.tensor` (tensor parallelism) are updated through `distribute_tensor` so that each shard is correctly placed on the right device mesh. + +**Request body:** + +| Field | Description | Defaults | Options | +| --- | --- | --- | --- | +| `model_path` | The model path with the new weights. | Required | Type: str | +| `flush_cache` | Flush TeaCache state after update. | `True` | Type: bool | +| `target_modules` | List of module names to update (e.g. `["transformer"]`). If omitted, all `nn.Module` components are updated. | `None` | Type: list[str] | + +**Response body:** + +| Field | Description | Defaults | Options | +| --- | --- | --- | --- | +| `success` | Whether the update succeeded. | - | Type: bool | +| `message` | Status / error message. | - | Type: str | + +> **Note:** The diffusion engine (SGLang-Diffusion) does not currently support hot refit (updating weights while inference is in progress). The diffusion scheduler processes one request at a time and completes the entire inference before handling the next request, so weight updates and inference never run concurrently. + ### Update Weights from Tensor **When to use:** diff --git a/docs/advanced_features/speculative_decoding.ipynb b/docs/advanced_features/speculative_decoding.ipynb deleted file mode 100644 index aa62b897a8b6..000000000000 --- a/docs/advanced_features/speculative_decoding.ipynb +++ /dev/null @@ -1,370 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Speculative Decoding\n", - "\n", - "SGLang now provides an EAGLE-based (EAGLE-2/EAGLE-3) speculative decoding option. 
Our implementation aims to maximize speed and efficiency and is considered to be among the fastest in open-source LLM engines.\n", - "\n", - "### Performance Highlights\n", - "\n", - "Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be achieved via EAGLE3 decoding.\n", - "For further details please see the [EAGLE3 paper](https://arxiv.org/pdf/2503.01840).\n", - "\n", - "| Method | Throughput (tokens/s) |\n", - "|--------|----------------|\n", - "| SGLang (w/o speculative, 1x H100) | 158.34 tokens/s |\n", - "| SGLang + EAGLE-2 (1x H100) | 244.10 tokens/s |\n", - "| SGLang + EAGLE-3 (1x H100) | 373.25 tokens/s |" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## EAGLE Decoding\n", - "\n", - "To enable EAGLE speculative decoding the following parameters are relevant:\n", - "* `speculative_draft_model_path`: Specifies draft model. This parameter is required.\n", - "* `speculative_num_steps`: Depth of autoregressive drafting. Increases speculation range but risks rejection cascades. Default is 5.\n", - "* `speculative_eagle_topk`: Branching factor per step. Improves candidate diversity, will lead to higher acceptance rate, but more lead to higher memory/compute consumption. Default is 4.\n", - "* `speculative_num_draft_tokens`: Maximum parallel verification capacity. Allows deeper tree evaluation but will lead to higher GPU memory usage. Default is 8.\n", - "\n", - "These parameters are the same for EAGLE-2 and EAGLE-3.\n", - "\n", - "You can find the best combinations of these parameters with [bench_speculative.py](https://github.com/sgl-project/sglang/blob/main/scripts/playground/bench_speculative.py).\n", - "\n", - "In the documentation below, we set `--cuda-graph-max-bs` to be a small value for faster engine startup. 
For your own workloads, please tune the above parameters together with `--cuda-graph-max-bs`, `--max-running-requests`, `--mem-fraction-static` for the best performance. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### EAGLE-2 decoding\n", - "\n", - "You can enable EAGLE-2 decoding by setting `--speculative-algorithm EAGLE` and choosing an appropriate model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sglang.test.doc_patch import launch_server_cmd\n", - "from sglang.utils import wait_for_server, print_highlight, terminate_process\n", - "\n", - "import openai" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "server_process, port = launch_server_cmd(\n", - " \"\"\"\n", - "python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n", - " --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 3 \\\n", - " --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --cuda-graph-max-bs 8 --log-level warning\n", - "\"\"\"\n", - ")\n", - "\n", - "wait_for_server(f\"http://localhost:{port}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n", - "\n", - "response = client.chat.completions.create(\n", - " model=\"meta-llama/Llama-2-7b-chat-hf\",\n", - " messages=[\n", - " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n", - " ],\n", - " temperature=0,\n", - " max_tokens=64,\n", - ")\n", - "\n", - "print_highlight(f\"Response: {response}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "terminate_process(server_process)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "### EAGLE-2 Decoding with `torch.compile`\n", - "\n", - "You can also enable `torch.compile` for further optimizations and optionally set `--torch-compile-max-bs`:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "server_process, port = launch_server_cmd(\n", - " \"\"\"\n", - "python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n", - " --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n", - " --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \\\n", - " --enable-torch-compile --torch-compile-max-bs 2 --log-level warning\n", - "\"\"\"\n", - ")\n", - "\n", - "wait_for_server(f\"http://localhost:{port}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n", - "\n", - "response = client.chat.completions.create(\n", - " model=\"meta-llama/Llama-2-7b-chat-hf\",\n", - " messages=[\n", - " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n", - " ],\n", - " temperature=0,\n", - " max_tokens=64,\n", - ")\n", - "\n", - "print_highlight(f\"Response: {response}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "terminate_process(server_process)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### EAGLE-2 Decoding via Frequency-Ranked Speculative Sampling\n", - "\n", - "By employing a truncated high-frequency token vocabulary in the draft model, Eagle speculative decoding reduces `lm_head` computational overhead while accelerating the pipeline without quality degradation. 
For more details, checkout [the paper](https://arxiv.org/pdf/arXiv:2502.14856).\n", - "\n", - "In our implementation, set `--speculative-token-map` to enable the optimization. You can get the high-frequency token in FR-Spec from [this model](https://huggingface.co/thunlp/LLaMA3-Instruct-8B-FR-Spec). Or you can obtain high-frequency token by directly downloading these token from [this repo](https://github.com/thunlp/FR-Spec/tree/main?tab=readme-ov-file#prepare-fr-spec-vocabulary-subset).\n", - "\n", - "Thanks for the contribution from [Weilin Zhao](https://github.com/Achazwl) and [Zhousx](https://github.com/Zhou-sx). " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "server_process, port = launch_server_cmd(\n", - " \"\"\"\n", - "python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B-Instruct --speculative-algorithm EAGLE \\\n", - " --speculative-draft-model-path lmsys/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \\\n", - " --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --speculative-token-map thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt \\\n", - " --mem-fraction 0.7 --cuda-graph-max-bs 2 --dtype float16 --log-level warning\n", - "\"\"\"\n", - ")\n", - "\n", - "wait_for_server(f\"http://localhost:{port}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n", - "\n", - "response = client.chat.completions.create(\n", - " model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n", - " messages=[\n", - " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n", - " ],\n", - " temperature=0,\n", - " max_tokens=64,\n", - ")\n", - "\n", - "print_highlight(f\"Response: {response}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"terminate_process(server_process)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### EAGLE-3 Decoding\n", - "\n", - "You can enable EAGLE-3 decoding by setting `--speculative-algorithm EAGLE3` and choosing an appropriate model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "server_process, port = launch_server_cmd(\n", - " \"\"\"\n", - "python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B-Instruct --speculative-algorithm EAGLE3 \\\n", - " --speculative-draft-model-path jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B --speculative-num-steps 5 \\\n", - " --speculative-eagle-topk 8 --speculative-num-draft-tokens 32 --mem-fraction 0.6 \\\n", - " --cuda-graph-max-bs 2 --dtype float16 --log-level warning\n", - "\"\"\"\n", - ")\n", - "\n", - "wait_for_server(f\"http://localhost:{port}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n", - "\n", - "response = client.chat.completions.create(\n", - " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", - " messages=[\n", - " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n", - " ],\n", - " temperature=0,\n", - " max_tokens=64,\n", - ")\n", - "\n", - "print_highlight(f\"Response: {response}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "terminate_process(server_process)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Multi Token Prediction\n", - "\n", - "We support [MTP(Multi-Token Prediction)](https://arxiv.org/pdf/2404.19737) in SGLang by using speculative decoding. 
We use Xiaomi/MiMo-7B-RL model as example here (deepseek mtp usage refer to [deepseek doc](../basic_usage/deepseek.md#multi-token-prediction))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "server_process, port = launch_server_cmd(\n", - " \"\"\"\n", - " python3 -m sglang.launch_server --model-path XiaomiMiMo/MiMo-7B-RL --host 0.0.0.0 --trust-remote-code \\\n", - " --speculative-algorithm EAGLE --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 \\\n", - " --mem-fraction 0.5 --log-level warning\n", - "\"\"\"\n", - ")\n", - "\n", - "wait_for_server(f\"http://localhost:{port}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "\n", - "url = f\"http://localhost:{port}/v1/chat/completions\"\n", - "\n", - "data = {\n", - " \"model\": \"XiaomiMiMo/MiMo-7B-RL\",\n", - " \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n", - "}\n", - "\n", - "response = requests.post(url, json=data)\n", - "print_highlight(response.json())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "terminate_process(server_process)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## References\n", - "\n", - "EAGLE process is as follows:\n", - "\n", - "- Within EAGLE the draft model predicts the next feature vector, i.e. the last hidden state of the original LLM, using the feature sequence $(f_1, ..., f_k)$ and the token sequence $(t_2, ..., t_{k+1})$. \n", - "- The next token is then sampled from $p_{k+2}=\\text{LMHead}(f_{k+1})$. 
Afterwards, the two sequences are extended in a tree style—branching out multiple potential continuations, with the branching factor per step controlled by the `speculative_eagle_topk` parameter—to ensure a more coherent connection of context, and are given as input again.\n", - "- EAGLE-2 additionally uses the draft model to evaluate how probable certain branches in the draft tree are, dynamically stopping the expansion of unlikely branches. After the expansion phase, reranking is employed to select only the top `speculative_num_draft_tokens` final nodes as draft tokens.\n", - "- EAGLE-3 removes the feature prediction objective, incorporates low and mid-layer features, and is trained in an on-policy manner.\n", - "\n", - "This enhances drafting accuracy by operating on the features instead of tokens for more regular inputs and passing the tokens from the next timestep additionally to minimize randomness effects from sampling. Furthermore the dynamic adjustment of the draft tree and selection of reranked final nodes increases acceptance rate of draft tokens further. For more details see [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) paper.\n", - "\n", - "\n", - "For guidance how to train your own EAGLE model please see the [EAGLE repo](https://github.com/SafeAILab/EAGLE/tree/main?tab=readme-ov-file#train)." 
- ] - } - ], - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/advanced_features/speculative_decoding.md b/docs/advanced_features/speculative_decoding.md new file mode 100644 index 000000000000..121ae8d586b7 --- /dev/null +++ b/docs/advanced_features/speculative_decoding.md @@ -0,0 +1,568 @@ +# Speculative Decoding + +SGLang provides several speculative decoding options, including EAGLE-2/EAGLE-3, MTP, classic draft-model decoding, and an NGRAM-based variant. Our implementation aims to maximize speed and efficiency and is considered to be among the fastest in open-source LLM engines. + +## Summary + +### Jump to sections + +- [EAGLE Decoding](#eagle-decoding) + - [EAGLE-2 Decoding](#eagle-2-decoding) + - [EAGLE-2 Decoding with torch.compile](#eagle-2-decoding-with-torchcompile) + - [EAGLE-2 Decoding via Frequency-Ranked Speculative Sampling](#eagle-2-decoding-via-frequency-ranked-speculative-sampling) + - [EAGLE-3 Decoding](#eagle-3-decoding) +- [Multi Token Prediction](#multi-token-prediction) +- [Standalone Speculative Decoding (Small Draft Model)](#standalone-speculative-decoding-small-draft-model) +- [Speculative Decoding V2 (Overlap Scheduler)](#speculative-decoding-v2-overlap-scheduler) +- [Ngram Speculative Decoding](#ngram-speculative-decoding) +- [Full Parameter Reference](#full-parameter-reference) +- [OOM Troubleshooting](#oom-troubleshooting) +- [References](#references) + +### Quick guidance + +- **Best speed/quality (recommended)**: Use **EAGLE-3** with `--speculative-algorithm EAGLE3`. +- **Strong default / broad compatibility**: Use **EAGLE-2** with `--speculative-algorithm EAGLE`. +- **Lower `lm_head` overhead for EAGLE-2**: Enable **FR-Spec** with `--speculative-token-map`. 
+- **Model is MTP-enabled**: Use **MTP via speculative decoding** (often with small `speculative_num_steps/topk/num_draft_tokens`, see the example section). +- **You have a smaller draft LLM**: Use **STANDALONE** (`--speculative-algorithm STANDALONE`). +- **No extra model available**: Use **NGRAM** (`--speculative-algorithm NGRAM`, CUDA-only). +- **Want overlap scheduler (experimental)**: Enable **SpecV2** with `SGLANG_ENABLE_SPEC_V2=True` (requires `--speculative-eagle-topk 1`). + +### Method comparison (mini table) + +| Method | Draft source | Separate draft model? | How to enable | Notes / constraints | +|---|---|---:|---|---| +| EAGLE-2 | EAGLE draft model (feature drafting + tree) | Typically yes | `--speculative-algorithm EAGLE` + `--speculative-draft-model-path ...` | Tune `--speculative-num-steps`, `--speculative-eagle-topk`, `--speculative-num-draft-tokens` | +| EAGLE-2 + `torch.compile` | Same as EAGLE-2 | Typically yes | Add `--enable-torch-compile` (optionally `--torch-compile-max-bs`) | Benefit varies by hardware/model; benchmark to verify | +| EAGLE-2 + FR-Spec | Same as EAGLE-2 + token subset | Typically yes | Add `--speculative-token-map ...` | Reduces `lm_head` overhead with high-frequency token vocab | +| EAGLE-3 | EAGLE3 draft model | Yes | `--speculative-algorithm EAGLE3` + `--speculative-draft-model-path ...` | Best throughput in the benchmark below | +| MTP | Built-in multi-token heads (model-specific) | Often no | See **Multi Token Prediction** section | Uses speculative workflow; draft path may be auto-handled for some models | +| STANDALONE | Smaller draft LLM (token-level) | Yes | `--speculative-algorithm STANDALONE` + `--speculative-draft-model-path ...` | Does **not** support `--enable-dp-attention` | +| SpecV2 (experimental) | V2 workers + overlap scheduler | N/A | `SGLANG_ENABLE_SPEC_V2=True` | Only supports `--speculative-eagle-topk 1`; applies to `EAGLE`, `EAGLE3`, `STANDALONE` | +| NGRAM | Ngram cache from previous tokens | No | 
`--speculative-algorithm NGRAM` | CUDA-only; no `--enable-dp-attention`; disables overlap scheduler & mixed chunked prefill | + +### Performance Highlights + +Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be achieved via EAGLE3 decoding. +For further details please see the [EAGLE3 paper](https://arxiv.org/pdf/2503.01840). + +| Method | Throughput (tokens/s) | +|--------|----------------| +| SGLang (w/o speculative, 1x H100) | 158.34 tokens/s | +| SGLang + EAGLE-2 (1x H100) | 244.10 tokens/s | +| SGLang + EAGLE-3 (1x H100) | 373.25 tokens/s | + +--- + +## EAGLE Decoding + +To enable EAGLE speculative decoding the following parameters are relevant: + +| Parameter | Description | Default | +|---|---|---| +| `--speculative-draft-model-path` | Draft model path/weights. **Typically required** for EAGLE/EAGLE3 and STANDALONE. For some MTP-enabled models, this can be omitted. | `None` | +| `--speculative-num-steps` | Depth of autoregressive drafting. Increases speculation range but risks rejection cascades. | Auto (`5` for Llama/Grok; `3` for many other models) | +| `--speculative-eagle-topk` | Branching factor per step. Improves candidate diversity and acceptance rate, but increases memory/compute consumption. | Auto (`4` for Llama/Grok; `1` for many other models) | +| `--speculative-num-draft-tokens` | Maximum parallel verification capacity. Allows deeper tree evaluation but increases GPU memory usage. | Auto (`8` for Llama/Grok; `4` for many other models). If `topk=1`, it is adjusted to `num_steps + 1`. | +| `--speculative-accept-threshold-single` | Acceptance threshold for single-token verification. Lower values accept more aggressively. | `1.0` | +| `--speculative-accept-threshold-acc` | Accumulated acceptance threshold across steps. | `1.0` | +| `--speculative-attention-mode` | Attention mode for speculative operations (`prefill` or `decode`), affecting both target verification and draft extension. 
| `"prefill"` | +| `--speculative-draft-attention-backend` | Override attention backend for the draft model. | `None` (same as target) | +| `--speculative-draft-model-quantization` | Quantization method for the draft model. Use `"unquant"` to force no quantization even when the target model is quantized. | Same as target model | +| `--speculative-draft-model-revision` | Specific revision/commit of the draft model to load. | `None` (auto-set to `"main"` when `--speculative-draft-model-path` is set and revision is omitted) | +| `--speculative-draft-load-format` | Load format for the draft model weights. | `None` | + +These parameters are mostly the same for EAGLE-2 and EAGLE-3. `--speculative-token-map` is ignored for EAGLE-3 models. +For `--speculative-num-steps`, `--speculative-eagle-topk`, and `--speculative-num-draft-tokens`: leave all three unset to use auto-tuning, or set all three explicitly when tuning. + +You can find the best combinations of these parameters with [bench_speculative.py](https://github.com/sgl-project/sglang/blob/main/scripts/playground/bench_speculative.py). + + +### EAGLE-2 Decoding + +You can enable EAGLE-2 Decoding by setting `--speculative-algorithm EAGLE` and choosing an appropriate model. 
+ +**Launch the server:** + +```bash +python3 -m sglang.launch_server \ + --model meta-llama/Llama-2-7b-chat-hf \ + --speculative-algorithm EAGLE \ + --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 4 \ + --speculative-num-draft-tokens 16 \ + --mem-fraction-static 0.7 \ + --cuda-graph-max-bs 8 \ + --log-level warning +``` + +**Send a request:** + +```python +import openai + +client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None") + +response = client.chat.completions.create( + model="meta-llama/Llama-2-7b-chat-hf", + messages=[ + {"role": "user", "content": "List 3 countries and their capitals."}, + ], + temperature=0, + max_tokens=64, +) + +print(response.choices[0].message.content) +``` + +--- + +### EAGLE-2 Decoding with `torch.compile` + +You can optionally enable `torch.compile` to apply kernel-level optimizations (operator fusion, autotune) to the draft model. The actual speedup depends on your hardware, model architecture, and batch size. In some configurations (e.g., small draft models on H100 where cuBLAS is already optimal and CUDA graphs are enabled), the benefit may be negligible. We recommend benchmarking with and without this flag on your specific setup to verify whether it helps. 
+ +To enable it, add `--enable-torch-compile` and optionally set `--torch-compile-max-bs`: + +```bash +python3 -m sglang.launch_server \ + --model meta-llama/Llama-2-7b-chat-hf \ + --speculative-algorithm EAGLE \ + --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 4 \ + --speculative-num-draft-tokens 16 \ + --mem-fraction-static 0.7 \ + --enable-torch-compile \ + --torch-compile-max-bs 8 \ + --log-level warning +``` + +**Send a request:** + +```python +import openai + +client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None") + +response = client.chat.completions.create( + model="meta-llama/Llama-2-7b-chat-hf", + messages=[ + {"role": "user", "content": "List 3 countries and their capitals."}, + ], + temperature=0, + max_tokens=64, +) + +print(response.choices[0].message.content) +``` + +--- + +### EAGLE-2 Decoding via Frequency-Ranked Speculative Sampling + +By employing a truncated high-frequency token vocabulary in the draft model, EAGLE speculative decoding reduces `lm_head` computational overhead while accelerating the pipeline without quality degradation. For more details, check out [the paper](https://arxiv.org/pdf/2502.14856). + +In our implementation, set `--speculative-token-map` to enable the optimization. You can get the high-frequency tokens in FR-Spec from [this model](https://huggingface.co/thunlp/LLaMA3-Instruct-8B-FR-Spec). Or you can obtain high-frequency tokens by directly downloading these tokens from [this repo](https://github.com/thunlp/FR-Spec/tree/main?tab=readme-ov-file#prepare-fr-spec-vocabulary-subset). + +Thanks for the contribution from [Weilin Zhao](https://github.com/Achazwl) and [Zhousx](https://github.com/Zhou-sx). 
+ +```bash +python3 -m sglang.launch_server \ + --model meta-llama/Meta-Llama-3-8B-Instruct \ + --speculative-algorithm EAGLE \ + --speculative-draft-model-path lmsys/sglang-EAGLE-LLaMA3-Instruct-8B \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 4 \ + --speculative-num-draft-tokens 16 \ + --speculative-token-map thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt \ + --mem-fraction-static 0.7 \ + --cuda-graph-max-bs 8 \ + --dtype float16 \ + --log-level warning +``` + +**Send a request:** + +```python +import openai + +client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None") + +response = client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "List 3 countries and their capitals."}, + ], + temperature=0, + max_tokens=64, +) + +print(response.choices[0].message.content) +``` + +--- + +### EAGLE-3 Decoding + +You can enable EAGLE-3 decoding by setting `--speculative-algorithm EAGLE3` and choosing an appropriate model. + +```bash +python3 -m sglang.launch_server \ + --model meta-llama/Meta-Llama-3.1-8B-Instruct \ + --speculative-algorithm EAGLE3 \ + --speculative-draft-model-path jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 4 \ + --speculative-num-draft-tokens 16 \ + --mem-fraction-static 0.7 \ + --cuda-graph-max-bs 8 \ + --dtype float16 \ + --log-level warning +``` + +**Send a request:** + +```python +import openai + +client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None") + +response = client.chat.completions.create( + model="meta-llama/Meta-Llama-3.1-8B-Instruct", + messages=[ + {"role": "user", "content": "List 3 countries and their capitals."}, + ], + temperature=0, + max_tokens=64, +) + +print(response.choices[0].message.content) +``` + +--- + +## Multi Token Prediction + +We support [MTP (Multi-Token Prediction)](https://arxiv.org/pdf/2404.19737) in SGLang by using speculative decoding. 
We use `XiaomiMiMo/MiMo-7B-RL` as an example here (for DeepSeek MTP usage, refer to [deepseek_v32 doc](../basic_usage/deepseek_v32.md#multi-token-prediction)). + +```bash +python3 -m sglang.launch_server \ + --model XiaomiMiMo/MiMo-7B-RL \ + --host 0.0.0.0 \ + --trust-remote-code \ + --speculative-algorithm EAGLE \ + --speculative-num-steps 1 \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens 2 \ + --mem-fraction-static 0.7 \ + --cuda-graph-max-bs 8 \ + --log-level warning +``` + +**Send a request:** + +```python +import requests + +url = "http://localhost:30000/v1/chat/completions" + +data = { + "model": "XiaomiMiMo/MiMo-7B-RL", + "messages": [{"role": "user", "content": "What is the capital of France?"}], +} + +response = requests.post(url, json=data) +print(response.json()) +``` + +--- + +## Standalone Speculative Decoding (Small Draft Model) + +Besides EAGLE/MTP, SGLang also supports **token-level speculative decoding** using a smaller **draft model**. Enable it with `--speculative-algorithm STANDALONE` and provide a draft model via `--speculative-draft-model-path`. + +Relevant parameters: + +| Parameter | Description | Default | +|---|---|---| +| `--speculative-draft-model-path` | Draft model weights (smaller than the target model). | `None` | +| `--speculative-num-steps` | Draft depth (how many steps the draft model runs autoregressively). | `3` (auto default for STANDALONE) | +| `--speculative-eagle-topk` | Branching factor (token candidates per step). | `1` (auto default for STANDALONE) | +| `--speculative-num-draft-tokens` | Verification capacity. | `4` (auto default for STANDALONE) | +| `--speculative-draft-model-quantization` | Quantization for the draft model. Use `"unquant"` to disable quantization on the draft even when the target is quantized. | Same as target | + +> **Note:** Standalone speculative decoding currently **does not support** `--enable-dp-attention`. 
+ +```bash +python3 -m sglang.launch_server \ + --model Qwen/Qwen2.5-7B-Instruct \ + --speculative-algorithm STANDALONE \ + --speculative-draft-model-path Qwen/Qwen2.5-1.5B-Instruct \ + --speculative-num-steps 4 \ + --speculative-eagle-topk 2 \ + --speculative-num-draft-tokens 7 \ + --mem-fraction-static 0.7 \ + --cuda-graph-max-bs 8 \ + --log-level warning +``` + +**Send a request:** + +```python +import openai + +client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None") + +response = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct", + messages=[ + {"role": "user", "content": "List 3 countries and their capitals."}, + ], + temperature=0, + max_tokens=64, +) + +print(response.choices[0].message.content) +``` + +--- + +## Speculative Decoding V2 (Overlap Scheduler) + +SGLang provides an **experimental Speculative Decoding V2** implementation that enables an overlap scheduler and uses V2 speculative workers (e.g. `StandaloneWorkerV2`, `EAGLEWorkerV2`). + +To enable it, set the environment variable: +- `SGLANG_ENABLE_SPEC_V2=True` + +Notes: +- SpecV2 currently only supports `--speculative-eagle-topk 1`. When SpecV2 is enabled, **set `--speculative-eagle-topk 1` explicitly**. +- If you explicitly set `--speculative-eagle-topk > 1`, the server will error. +- If you omit `--speculative-eagle-topk`, auto-tuning may pick `topk > 1` for some models (e.g. Llama). This is incompatible with SpecV2 and may not always trigger an immediate config error, so set `--speculative-eagle-topk 1` explicitly. +- This applies to `EAGLE`, `EAGLE3`, and `STANDALONE`. 
+ +```bash +SGLANG_ENABLE_SPEC_V2=True python3 -m sglang.launch_server \ + --model Qwen/Qwen2.5-7B-Instruct \ + --speculative-algorithm STANDALONE \ + --speculative-draft-model-path Qwen/Qwen2.5-1.5B-Instruct \ + --speculative-num-steps 4 \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens 5 \ + --mem-fraction-static 0.7 \ + --cuda-graph-max-bs 8 \ + --log-level warning +``` + +**Send a request:** + +```python +import openai + +client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None") + +response = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct", + messages=[ + {"role": "user", "content": "List 3 countries and their capitals."}, + ], + temperature=0, + max_tokens=64, +) + +print(response.choices[0].message.content) +``` + +--- + +## Ngram Speculative Decoding + +SGLang also supports **ngram-based speculative decoding** (no separate draft model). It retrieves draft tokens from an ngram cache built from previously generated tokens, and then verifies them with the target model. + +Enable it with: +- `--speculative-algorithm NGRAM` + +### Ngram-specific parameters + +| Parameter | Description | Default | +|---|---|---| +| `--speculative-num-draft-tokens` | Number of draft tokens verified per step. If omitted, defaults to `--speculative-ngram-max-match-window-size`. | `12` (with default ngram settings) | +| `--speculative-ngram-min-match-window-size` | Minimum matching window size. | `1` | +| `--speculative-ngram-max-match-window-size` | Maximum matching window size. | `12` | +| `--speculative-ngram-min-bfs-breadth` | Minimum BFS breadth. | `1` | +| `--speculative-ngram-max-bfs-breadth` | Maximum BFS breadth. | `10` | +| `--speculative-ngram-match-type` | Match type: `"BFS"` or `"PROB"`. | `"BFS"` | +| `--speculative-ngram-branch-length` | How many recent tokens to insert into the cache. | `18` | +| `--speculative-ngram-capacity` | Cache capacity (number of entries). 
| `10,000,000` | + +Notes: +- Ngram speculative decoding **only supports CUDA**. +- It currently **does not support** `--enable-dp-attention`. +- It disables the overlap scheduler and mixed chunked prefill. +- If `--speculative-ngram-max-bfs-breadth > 1` (thus `speculative_eagle_topk > 1`) and `page_size > 1`, use `--attention-backend flashinfer`; otherwise the server will error. +- Optional: set `SGLANG_NGRAM_FORCE_GREEDY_VERIFY=True` to force greedy verification. + +```bash +python3 -m sglang.launch_server \ + --model Qwen/Qwen2.5-7B-Instruct \ + --speculative-algorithm NGRAM \ + --speculative-num-draft-tokens 16 \ + --speculative-ngram-max-match-window-size 12 \ + --speculative-ngram-max-bfs-breadth 10 \ + --mem-fraction-static 0.7 \ + --cuda-graph-max-bs 8 \ + --log-level warning +``` + +**Send a request:** + +```python +import openai + +client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None") + +response = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct", + messages=[ + {"role": "user", "content": "List 3 countries and their capitals."}, + ], + temperature=0, + max_tokens=64, +) + +print(response.choices[0].message.content) +``` + +--- + +## Full Parameter Reference + +Below is a comprehensive list of all speculative decoding parameters available in SGLang: + +### Core parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `--speculative-algorithm` | `str` | `None` | Algorithm to use: `EAGLE`, `EAGLE3`, `STANDALONE`, `NGRAM`, `NEXTN` (alias of `EAGLE`) | +| `--speculative-draft-model-path` | `str` | `None` | Path to the draft model weights | +| `--speculative-draft-model-revision` | `str` | `None` | Specific revision/commit of the draft model (`"main"` is auto-used when draft path is set and revision is omitted) | +| `--speculative-draft-load-format` | `str` | `None` | Load format for draft model weights | +| `--speculative-num-steps` | `int` | `None` (auto-chosen when omitted) | Autoregressive 
drafting depth | +| `--speculative-eagle-topk` | `int` | `None` (auto-chosen when omitted) | Branching factor per drafting step | +| `--speculative-num-draft-tokens` | `int` | `None` (auto-chosen when omitted) | Maximum number of draft tokens for verification | +| `--speculative-accept-threshold-single` | `float` | `1.0` | Single-token acceptance threshold | +| `--speculative-accept-threshold-acc` | `float` | `1.0` | Accumulated acceptance threshold | +| `--speculative-token-map` | `str` | `None` | Path to FR-Spec high-frequency token map | +| `--speculative-attention-mode` | `str` | `"prefill"` | Attention mode for speculative operations (`"prefill"` or `"decode"`) | +| `--speculative-draft-attention-backend` | `str` | `None` | Override attention backend for the draft model | +| `--speculative-moe-runner-backend` | `str` | `None` | MoE runner backend for the draft model | +| `--speculative-moe-a2a-backend` | `str` | `None` | MoE all-to-all backend for the draft model | +| `--speculative-draft-model-quantization` | `str` | Same as target | Quantization for the draft model (`"unquant"` to disable) | + +### Ngram-specific parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `--speculative-ngram-min-match-window-size` | `int` | `1` | Minimum ngram matching window | +| `--speculative-ngram-max-match-window-size` | `int` | `12` | Maximum ngram matching window | +| `--speculative-ngram-min-bfs-breadth` | `int` | `1` | Minimum BFS breadth | +| `--speculative-ngram-max-bfs-breadth` | `int` | `10` | Maximum BFS breadth | +| `--speculative-ngram-match-type` | `str` | `"BFS"` | Match type: `"BFS"` or `"PROB"` | +| `--speculative-ngram-branch-length` | `int` | `18` | Recent tokens to insert into cache | +| `--speculative-ngram-capacity` | `int` | `10,000,000` | Cache capacity | + +### Environment variables + +| Variable | Default | Description | +|---|---|---| +| `SGLANG_ENABLE_SPEC_V2` | `False` | Enable Speculative Decoding V2 (overlap scheduler) 
| +| `SGLANG_NGRAM_FORCE_GREEDY_VERIFY` | `False` | Force greedy verification for ngram decoding | + +### Other related flags + +| Parameter | Description | +|---|---| +| `--enable-multi-layer-eagle` | Enable multi-layer EAGLE (auto-enabled for MiMoV2 and Step3p5 models) | +| `--enable-torch-compile` | Enable `torch.compile` for kernel-level optimizations | +| `--torch-compile-max-bs` | Maximum batch size for `torch.compile` | + +--- + +## OOM Troubleshooting + +> [!WARNING] +> **Out of Memory (OOM)?** Speculative decoding may increase GPU memory usage because the draft tree, CUDA graphs, and verification-related buffers consume additional VRAM. If you encounter OOM errors, try the following adjustments. + +### Step 1: Lower static memory fraction (most effective) + +```bash +--mem-fraction-static 0.5 # when omitted, this value is auto-computed +``` + +- `--mem-fraction-static` controls the memory budget for model weights + KV cache pool. +- Lowering it directly increases dynamic headroom for activations and CUDA graph buffers. +- If omitted, SGLang auto-estimates this value from other settings, and those auto settings can still be too aggressive for some workloads. + +### Step 2: Reduce CUDA graph batch size + +```bash +# Fewer CUDA graph captures = less memory reserved +--cuda-graph-max-bs 4 # or even 2 for tight memory situations +``` + +- If omitted, `--cuda-graph-max-bs` is auto-selected based on GPU memory and TP size, and can be much larger on high-memory GPUs. 
+ +### Step 3: Reduce draft tree size + +These three parameters directly control how much memory the draft tree consumes: + +```bash +# Before (aggressive, high memory) +--speculative-num-steps 5 --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 + +# After (conservative, lower memory) +--speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 +``` + +### Step 4: Limit concurrent requests + +```bash +# Fewer concurrent requests lowers in-flight load and can reduce OOM risk +--max-running-requests 4 +``` + +### Quick OOM recovery recipe + +If you're hitting OOM and just want something that works, start with this minimal configuration and scale up: + +```bash +python3 -m sglang.launch_server \ + --model \ + --speculative-algorithm EAGLE \ + --speculative-draft-model-path \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens 4 \ + --cuda-graph-max-bs 2 \ + --mem-fraction-static 0.5 \ + --max-running-requests 4 \ + --log-level warning +``` + +Then gradually increase `--speculative-num-draft-tokens`, `--speculative-eagle-topk`, and `--cuda-graph-max-bs`. Increase `--mem-fraction-static` last, only after the run is stable. + +--- + +## References + +EAGLE process is as follows: + +- Within EAGLE the draft model predicts the next feature vector, i.e. the last hidden state of the original LLM, using the feature sequence $(f_1, ..., f_k)$ and the token sequence $(t_2, ..., t_{k+1})$. +- The next token is then sampled from $p_{k+2}=\text{LMHead}(f_{k+1})$. Afterwards, the two sequences are extended in a tree style—branching out multiple potential continuations, with the branching factor per step controlled by the `speculative_eagle_topk` parameter—to ensure a more coherent connection of context, and are given as input again. 
+- In SGLang's EAGLE-2 implementation, the draft tree is expanded for the configured steps and then reranked to select the top `speculative_num_draft_tokens` final nodes as draft tokens. +- EAGLE-3 removes the feature prediction objective, incorporates low and mid-layer features, and is trained in an on-policy manner. + +This enhances drafting accuracy by operating on features instead of tokens for more regular inputs and by additionally passing tokens from the next timestep to reduce sampling randomness. For more details, see the [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) papers. + +For guidance on how to train your own EAGLE model please see the [EAGLE repo](https://github.com/SafeAILab/EAGLE/tree/main?tab=readme-ov-file#train). For EAGLE-3 training specifically, check out [SpecForge](https://github.com/sgl-project/SpecForge), the SGLang team's training framework designed for EAGLE-3 speculative decoding models with seamless porting to SGLang serving. See the [SpecForge documentation](https://docs.sglang.ai/SpecForge/) and [blog post](https://lmsys.org/blog/2025-07-25-spec-forge) for details. 
diff --git a/docs/advanced_features/structured_outputs.ipynb b/docs/advanced_features/structured_outputs.ipynb index b0ec5e6c7d61..ec603e4e62bf 100644 --- a/docs/advanced_features/structured_outputs.ipynb +++ b/docs/advanced_features/structured_outputs.ipynb @@ -54,7 +54,7 @@ " \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --log-level warning\"\n", ")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")\n", + "wait_for_server(f\"http://localhost:{port}\", process=server_process)\n", "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")" ] }, @@ -740,7 +740,6 @@ "import json\n", "from pydantic import BaseModel, Field\n", "\n", - "\n", "prompts = [\n", " \"Give me the information of the capital of China in the JSON format.\",\n", " \"Give me the information of the capital of France in the JSON format.\",\n", diff --git a/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb b/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb index 2b05a583775c..cfc07fd01629 100644 --- a/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb +++ b/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb @@ -50,7 +50,7 @@ " \"python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 --log-level warning\"\n", ")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")\n", + "wait_for_server(f\"http://localhost:{port}\", process=server_process)\n", "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")" ] }, @@ -642,7 +642,6 @@ "import json\n", "from pydantic import BaseModel, Field\n", "\n", - "\n", "prompts = [\n", " \"Give me the information of the capital of China in the JSON format.\",\n", " \"Give me the information of the capital of France in the JSON format.\",\n", diff --git a/docs/advanced_features/tool_parser.ipynb 
b/docs/advanced_features/tool_parser.ipynb index df1bc4bc7ba0..9afc9663e64f 100644 --- a/docs/advanced_features/tool_parser.ipynb +++ b/docs/advanced_features/tool_parser.ipynb @@ -60,7 +60,7 @@ "server_process, port = launch_server_cmd(\n", " \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\" # qwen25\n", ")\n", - "wait_for_server(f\"http://localhost:{port}\")" + "wait_for_server(f\"http://localhost:{port}\", process=server_process)" ] }, { @@ -550,7 +550,9 @@ "server_process_tool_choice, port_tool_choice = launch_server_cmd(\n", " \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\"\n", ")\n", - "wait_for_server(f\"http://localhost:{port_tool_choice}\")\n", + "wait_for_server(\n", + " f\"http://localhost:{port_tool_choice}\", process=server_process_tool_choice\n", + ")\n", "\n", "# Initialize client for tool choice examples\n", "client_tool_choice = OpenAI(\n", @@ -695,7 +697,7 @@ "server_process, port = launch_server_cmd(\n", " \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1 --log-level warning\" # llama-3.2-1b-instruct\n", ")\n", - "wait_for_server(f\"http://localhost:{port}\")\n", + "wait_for_server(f\"http://localhost:{port}\", process=server_process)\n", "\n", "tools = [\n", " {\n", diff --git a/docs/advanced_features/vlm_query.ipynb b/docs/advanced_features/vlm_query.ipynb index 45dd9a1efe01..f5497f5b713f 100644 --- a/docs/advanced_features/vlm_query.ipynb +++ b/docs/advanced_features/vlm_query.ipynb @@ -65,7 +65,8 @@ "nest_asyncio.apply()\n", "\n", "model_path = \"Qwen/Qwen2.5-VL-3B-Instruct\"\n", - "chat_template = \"qwen2-vl\"" + "chat_template = \"qwen2-vl\"\n", + "example_image_url = \"https://raw.githubusercontent.com/sgl-project/sglang/main/examples/assets/example_image.png\"" ] }, { @@ -81,13 +82,7 @@ "\n", "from 
sglang.srt.parser.conversation import chat_templates\n", "\n", - "image = Image.open(\n", - " BytesIO(\n", - " requests.get(\n", - " \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n", - " ).content\n", - " )\n", - ")\n", + "image = Image.open(BytesIO(requests.get(example_image_url).content))\n", "\n", "conv = chat_templates[chat_template].copy()\n", "conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n", @@ -117,7 +112,6 @@ "source": [ "from sglang import Engine\n", "\n", - "\n", "llm = Engine(model_path=model_path, chat_template=chat_template, log_level=\"warning\")" ] }, @@ -239,13 +233,7 @@ "from sglang.srt.parser.conversation import chat_templates\n", "\n", "# Download the same example image\n", - "image = Image.open(\n", - " BytesIO(\n", - " requests.get(\n", - " \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n", - " ).content\n", - " )\n", - ")\n", + "image = Image.open(BytesIO(requests.get(example_image_url).content))\n", "\n", "conv = chat_templates[chat_template].copy()\n", "conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n", diff --git a/docs/basic_usage/deepseek_v3.md b/docs/basic_usage/deepseek_v3.md index a321eb09cbb7..0fd06baedc16 100644 --- a/docs/basic_usage/deepseek_v3.md +++ b/docs/basic_usage/deepseek_v3.md @@ -223,7 +223,7 @@ Sample Request: ``` curl "http://127.0.0.1:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --d '{"temperature": 0, "max_tokens": 100, "model": "deepseek-ai/DeepSeek-V3-0324", "tools": [{"type": "function", "function": {"name": "query_weather", "description": "Get weather of an city, the user should supply a city first", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city, e.g. 
Beijing"}}, "required": ["city"]}}}], "messages": [{"role": "user", "content": "Hows the weather like in Qingdao today"}]}' +-d '{"temperature": 0, "max_tokens": 100, "model": "deepseek-ai/DeepSeek-V3-0324", "tools": [{"type": "function", "function": {"name": "query_weather", "description": "Get weather of a city, the user should supply a city first", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city, e.g. Beijing"}}, "required": ["city"]}}}], "messages": [{"role": "user", "content": "How'\''s the weather like in Qingdao today"}]}' ``` Expected Response @@ -236,7 +236,7 @@ Sample Streaming Request: ``` curl "http://127.0.0.1:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --d '{"temperature": 0, "max_tokens": 100, "model": "deepseek-ai/DeepSeek-V3-0324","stream":true,"tools": [{"type": "function", "function": {"name": "query_weather", "description": "Get weather of an city, the user should supply a city first", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city, e.g. Beijing"}}, "required": ["city"]}}}], "messages": [{"role": "user", "content": "Hows the weather like in Qingdao today"}]}' +-d '{"temperature": 0, "max_tokens": 100, "model": "deepseek-ai/DeepSeek-V3-0324","stream":true,"tools": [{"type": "function", "function": {"name": "query_weather", "description": "Get weather of a city, the user should supply a city first", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city, e.g. 
Beijing"}}, "required": ["city"]}}}], "messages": [{"role": "user", "content": "How'\''s the weather like in Qingdao today"}]}' ``` Expected Streamed Chunks (simplified for clarity): ``` diff --git a/docs/basic_usage/deepseek_v32.md b/docs/basic_usage/deepseek_v32.md index ca38db221f54..4894954e7056 100644 --- a/docs/basic_usage/deepseek_v32.md +++ b/docs/basic_usage/deepseek_v32.md @@ -66,9 +66,13 @@ python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --n - `fa3`: `flash_attn_with_kvcache` kernel from `flash_attn` library. Can only run on Hopper GPUs. It requires bf16 q, kv inputs. - `tilelang`: `tilelang` implementation that can run on GPU, HPU and NPU. - `aiter`: Aiter kernel on AMD HPUs. Can only be used as decode kernel. + - `trtllm`: `trtllm-mla` sparse kernel from flashinfer library. Only run on blackwell GPUs. It requires QKV bf16 or QKV fp8. - On the basis of performance benchmarks, the default configuration on H200 and B200 are set as follows : - H200: `flashmla_sparse` prefill attention (short-seq prefill uses MHA via FlashAttention varlen), `fa3` decode attention, `bf16` kv cache dtype. - B200: `flashmla_auto` prefill attention (short-seq prefill uses MHA via TRT-LLM ragged), `flashmla_kv` decode attention, `fp8_e4m3` kv cache dtype. `flashmla_auto` enables automatic selection of either `flashmla_sparse` or `flashmla_kv` kernel for prefill based on KV cache dtype, hardware, and heuristics. When FP8 KV cache is enabled and `total_kv_tokens < total_q_tokens * 512`, it uses the `flashmla_sparse` kernel; otherwise, it falls back to the `flashmla_kv` kernel. The heuristics may need to be tuned if the performance of either the `flashmla_sparse` or `flashmla_kv` kernel changes significantly. 
+- On the Blackwell platform, with a slight accuracy drop, performance can be boosted by up to 3x-5x + - B200: by choosing `trtllm` for both `--nsa-prefill-backend` and `--nsa-decode-backend`, the prefill attention uses MHA via TRT-LLM ragged for both short and long sequences (**accuracy impact**). Combining `trtllm` with the `fp8_e4m3` kv cache, the kv cache dim is `576` (kv_lora_rank + qk_rope_head_dim) (**accuracy impact**), compared to the combination of `flashmla_auto` and `fp8_e4m3` kv cache, where the kv cache dim is `656` (kv_lora_rank + scale storage (kv_lora_rank // quant_block_size * 4 bytes) + rope dimension storage). + ## Multi-token Prediction SGLang implements Multi-Token Prediction (MTP) for DeepSeek V3.2 based on [EAGLE speculative decoding](https://docs.sglang.io/advanced_features/speculative_decoding.html#EAGLE-Decoding). With this optimization, the decoding speed can be improved significantly on small batch sizes. Please look at [this PR](https://github.com/sgl-project/sglang/pull/11652) for more information. @@ -306,16 +310,13 @@ DeepSeek-V3.2-Speciale: For context parallel in DeepSeek V3.2 model, we provide two different modes of splitting tokens, which can be controlled with argument `--nsa-prefill-cp-mode`. -### In sequence splitting (default setting) - -The first mode can be enabled by `--nsa-prefill-cp-mode in-seq-split`. This mode implements context parallel for DSA by splitting the sequence uniformly between context parallel ranks. At attention stage, each cp rank computes the indexer results of sharded sequence, and collects the whole kv cache through all gather operator. +### In sequence splitting -The communication group for context parallel reuses the one for attention tp, thus `cp_size` equals `atten_tp_size = tp_size / dp_size`. +The first mode can be enabled by `--nsa-prefill-cp-mode in-seq-split`. This mode implements context parallel for DSA by splitting the sequence uniformly between context parallel ranks.
At attention stage, each cp rank computes the indexer results of sharded sequence, and collects the whole kv cache through all gather operator. Add `attn_cp_size` for communication group for context parallel. Note that in sequence splitting mode has the following restrictions: - The batch size is restricted to 1 for prefill batches -- Multi-node/PD disaggregation is still not supported -- `moe_dense_tp_size=1`, `kv_cache_dtype = "bf16"`, `moe_a2a_backend = "deepep"` +- `moe_dense_tp_size=1`, `moe_a2a_backend = "deepep"` - To ensure `cp_size > 1`, the passed in `tp_size` must be larger than `dp_size` For more details, please refer to PR https://github.com/sgl-project/sglang/pull/12065. @@ -323,10 +324,10 @@ For more details, please refer to PR https://github.com/sgl-project/sglang/pull/ Example: ```bash # In-seq splitting mode launched with EP + DP -python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --ep 8 --dp 2 --enable-dp-attention --enable-nsa-prefill-context-parallel --nsa-prefill-cp-mode in-seq-split --max-running-requests 32 +python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --ep 8 --dp 2 --enable-dp-attention --enable-nsa-prefill-context-parallel --attn-cp-size 4 --nsa-prefill-cp-mode in-seq-split --max-running-requests 32 ``` -### Round robin splitting +### Round robin splitting (default setting) This mode can be enabled by specifying the parameter `--nsa-prefill-cp-mode round-robin-split`, which distributes tokens across ranks based on `token_idx % cp_size`. 
@@ -337,7 +338,7 @@ For more details, please refer to PR https://github.com/sgl-project/sglang/pull/ Example usage: ```bash # Launch with FusedMoe + CP8 -python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --enable-nsa-prefill-context-parallel --nsa-prefill-cp-mode round-robin-split --max-running-requests 32 +python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --enable-nsa-prefill-context-parallel --attn-cp-size 8 --nsa-prefill-cp-mode round-robin-split --max-running-requests 32 ``` ### Pipeline Parallel + Context Parallel (PP + CP) @@ -361,6 +362,7 @@ python3 -m sglang.launch_server \ --tp 8 --pp-size 2 \ --dp-size 1 --moe-dense-tp-size 1 \ --enable-nsa-prefill-context-parallel \ + --attn-cp-size 8 \ --nsa-prefill-cp-mode round-robin-split \ --trust-remote-code \ --disable-radix-cache \ @@ -384,6 +386,7 @@ python3 -m sglang.launch_server \ --tp 8 --pp-size 2 \ --dp-size 1 --moe-dense-tp-size 1 \ --enable-nsa-prefill-context-parallel \ + --attn-cp-size 8 \ --nsa-prefill-cp-mode round-robin-split \ --trust-remote-code \ --disable-radix-cache \ @@ -411,6 +414,7 @@ python -m sglang.launch_server \ --tp 8 --pp-size 2 \ --dp-size 1 --moe-dense-tp-size 1 \ --enable-nsa-prefill-context-parallel \ + --attn-cp-size 8 \ --nsa-prefill-cp-mode round-robin-split \ --disaggregation-ib-device mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3 \ --trust-remote-code \ @@ -436,6 +440,7 @@ python -m sglang.launch_server \ --tp 8 --pp-size 2 \ --dp-size 1 --moe-dense-tp-size 1 \ --enable-nsa-prefill-context-parallel \ + --attn-cp-size 8 \ --nsa-prefill-cp-mode round-robin-split \ --disaggregation-ib-device mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3 \ --trust-remote-code \ diff --git a/docs/basic_usage/glmv.md b/docs/basic_usage/glmv.md index c56b6ecd54cb..ad36cea26ad2 100644 --- a/docs/basic_usage/glmv.md +++ b/docs/basic_usage/glmv.md @@ -133,4 +133,4 @@ python -m sglang.launch_server \ In SGLang, we can implement thinking budget with 
`CustomLogitProcessor`. -Launch a server with `--enable-custom-logit-processor` flag on. and using `Glm4MoeThinkingBudgetLogitProcessor` in the request likes `GLM-4.6` example in [glm45.md](./glm45.md). +Launch a server with the `--enable-custom-logit-processor` flag. Then, use `Glm4MoeThinkingBudgetLogitProcessor` in the request, similar to the `GLM-4.6` example in [glm45.md](./glm45.md). diff --git a/docs/basic_usage/gpt_oss.md b/docs/basic_usage/gpt_oss.md index f74ba40d90ae..9f81da4f4ee9 100644 --- a/docs/basic_usage/gpt_oss.md +++ b/docs/basic_usage/gpt_oss.md @@ -105,7 +105,7 @@ print(response.output_text) # Test python tool response = client.responses.create( model="openai/gpt-oss-120b", - instructions="You are a helfpul assistant, you could use python tool to execute code.", + instructions="You are a helpful assistant, you could use python tool to execute code.", input="Use python tool to calculate the sum of 29138749187 and 29138749187", # 58,277,498,374 tools=tools ) @@ -115,7 +115,7 @@ print(response.output_text) # Test browser tool response = client.responses.create( model="openai/gpt-oss-120b", - instructions="You are a helfpul assistant, you could use browser to search the web", + instructions="You are a helpful assistant, you could use browser to search the web", input="Search the web for the latest news about Nvidia stock price", tools=tools ) diff --git a/docs/basic_usage/minimax_m2.md b/docs/basic_usage/minimax_m2.md index 33d445790a6f..7ca6ed809fcb 100644 --- a/docs/basic_usage/minimax_m2.md +++ b/docs/basic_usage/minimax_m2.md @@ -1,13 +1,14 @@ -# MiniMax M2.1/M2 Usage +# MiniMax M2.5/M2.1/M2 Usage -[MiniMax-M2.1](https://huggingface.co/MiniMaxAI/MiniMax-M2.1) and [MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) are advanced large language models created by [MiniMax](https://www.minimax.io/). 
+[MiniMax-M2.5](https://huggingface.co/MiniMaxAI/MiniMax-M2.5), [MiniMax-M2.1](https://huggingface.co/MiniMaxAI/MiniMax-M2.1), and [MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) are advanced large language models created by [MiniMax](https://www.minimax.io/). -MiniMax-M2 series redefines efficiency for agents. It's a compact, fast, and cost-effective MoE model (230 billion total parameters with 10 billion active parameters) built for elite performance in coding and agentic tasks, all while maintaining powerful general intelligence. With just 10 billion activated parameters, MiniMax-M2 provides the sophisticated, end-to-end tool use performance expected from today's leading models, but in a streamlined form factor that makes deployment and scaling easier than ever. +The MiniMax-M2 series redefines efficiency for agents. These compact, fast, and cost-effective MoE models (230 billion total parameters with 10 billion active parameters) are built for elite performance in coding and agentic tasks, all while maintaining powerful general intelligence. With just 10 billion activated parameters, the MiniMax-M2 series provides sophisticated, end-to-end tool use performance expected from today's leading models, but in a streamlined form factor that makes deployment and scaling easier than ever. ## Supported Models This guide applies to the following models. You only need to update the model name during deployment. 
The following examples use **MiniMax-M2**: +- [MiniMaxAI/MiniMax-M2.5](https://huggingface.co/MiniMaxAI/MiniMax-M2.5) - [MiniMaxAI/MiniMax-M2.1](https://huggingface.co/MiniMaxAI/MiniMax-M2.1) - [MiniMaxAI/MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) @@ -49,6 +50,24 @@ python -m sglang.launch_server \ --mem-fraction-static 0.85 ``` +### AMD GPUs (MI300X/MI325X/MI355X) + +8-GPU deployment command: + +```bash +SGLANG_USE_AITER=1 python -m sglang.launch_server \ + --model-path MiniMaxAI/MiniMax-M2.5 \ + --tp-size 8 \ + --ep-size 8 \ + --attention-backend aiter \ + --tool-call-parser minimax-m2 \ + --reasoning-parser minimax-append-think \ + --host 0.0.0.0 \ + --trust-remote-code \ + --port 8000 \ + --mem-fraction-static 0.85 +``` + ## Testing Deployment After startup, you can test the SGLang OpenAI-compatible API with the following command: diff --git a/docs/basic_usage/native_api.ipynb b/docs/basic_usage/native_api.ipynb index 52e4386af6dc..05f4f3688306 100644 --- a/docs/basic_usage/native_api.ipynb +++ b/docs/basic_usage/native_api.ipynb @@ -49,7 +49,7 @@ " \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n", ")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")" + "wait_for_server(f\"http://localhost:{port}\", process=server_process)" ] }, { @@ -275,14 +275,12 @@ "metadata": {}, "outputs": [], "source": [ - "embedding_process, port = launch_server_cmd(\n", - " \"\"\"\n", + "embedding_process, port = launch_server_cmd(\"\"\"\n", "python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n", " --host 0.0.0.0 --is-embedding --log-level warning\n", - "\"\"\"\n", - ")\n", + "\"\"\")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")" + "wait_for_server(f\"http://localhost:{port}\", process=embedding_process)" ] }, { @@ -324,14 +322,12 @@ "metadata": {}, "outputs": [], "source": [ - "reranker_process, port = launch_server_cmd(\n", - " \"\"\"\n", + 
"reranker_process, port = launch_server_cmd(\"\"\"\n", "python3 -m sglang.launch_server --model-path BAAI/bge-reranker-v2-m3 \\\n", " --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding --log-level warning\n", - "\"\"\"\n", - ")\n", + "\"\"\")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")" + "wait_for_server(f\"http://localhost:{port}\", process=reranker_process)" ] }, { @@ -392,14 +388,12 @@ "metadata": {}, "outputs": [], "source": [ - "score_process, port = launch_server_cmd(\n", - " \"\"\"\n", + "score_process, port = launch_server_cmd(\"\"\"\n", "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \\\n", " --host 0.0.0.0 --log-level warning\n", - "\"\"\"\n", - ")\n", + "\"\"\")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")" + "wait_for_server(f\"http://localhost:{port}\", process=score_process)" ] }, { @@ -456,13 +450,11 @@ "# Note that SGLang now treats embedding models and reward models as the same type of models.\n", "# This will be updated in the future.\n", "\n", - "reward_process, port = launch_server_cmd(\n", - " \"\"\"\n", + "reward_process, port = launch_server_cmd(\"\"\"\n", "python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding --log-level warning\n", - "\"\"\"\n", - ")\n", + "\"\"\")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")" + "wait_for_server(f\"http://localhost:{port}\", process=reward_process)" ] }, { @@ -526,7 +518,7 @@ " \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode stat --log-level warning\"\n", ")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")" + "wait_for_server(f\"http://localhost:{port}\", process=expert_record_server_process)" ] }, { @@ -575,13 +567,11 @@ "metadata": {}, "outputs": [], "source": [ - "tokenizer_free_server_process, port = launch_server_cmd(\n", - " \"\"\"\n", + 
"tokenizer_free_server_process, port = launch_server_cmd(\"\"\"\n", "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct\n", - "\"\"\"\n", - ")\n", + "\"\"\")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")" + "wait_for_server(f\"http://localhost:{port}\", process=tokenizer_free_server_process)" ] }, { diff --git a/docs/basic_usage/openai_api_completions.ipynb b/docs/basic_usage/openai_api_completions.ipynb index e89dfd57ff78..ffa576ae52c5 100644 --- a/docs/basic_usage/openai_api_completions.ipynb +++ b/docs/basic_usage/openai_api_completions.ipynb @@ -39,7 +39,7 @@ " \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n", ")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")\n", + "wait_for_server(f\"http://localhost:{port}\", process=server_process)\n", "print(f\"Server started on http://localhost:{port}\")" ] }, diff --git a/docs/basic_usage/openai_api_embeddings.ipynb b/docs/basic_usage/openai_api_embeddings.ipynb index 26e95a4e7c12..a6c90c06b5f0 100644 --- a/docs/basic_usage/openai_api_embeddings.ipynb +++ b/docs/basic_usage/openai_api_embeddings.ipynb @@ -9,7 +9,7 @@ "SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n", "A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/guides/embeddings).\n", "\n", - "This tutorial covers the embedding APIs for embedding models. For a list of the supported models see the [corresponding overview page](../supported_models/embedding_models.md)\n" + "This tutorial covers the embedding APIs for embedding models. 
For a list of the supported models see the [corresponding overview page](../supported_models/retrieval_ranking/embedding_models.md)\n" ] }, { @@ -30,14 +30,12 @@ "from sglang.test.doc_patch import launch_server_cmd\n", "from sglang.utils import wait_for_server, print_highlight, terminate_process\n", "\n", - "embedding_process, port = launch_server_cmd(\n", - " \"\"\"\n", + "embedding_process, port = launch_server_cmd(\"\"\"\n", "python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n", " --host 0.0.0.0 --is-embedding --log-level warning\n", - "\"\"\"\n", - ")\n", + "\"\"\")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")" + "wait_for_server(f\"http://localhost:{port}\", process=embedding_process)" ] }, { @@ -173,7 +171,7 @@ "metadata": {}, "source": [ "## Multi-Modal Embedding Model\n", - "Please refer to [Multi-Modal Embedding Model](../supported_models/embedding_models.md)" + "Please refer to [Multi-Modal Embedding Model](../supported_models/retrieval_ranking/embedding_models.md)" ] } ], diff --git a/docs/basic_usage/openai_api_vision.ipynb b/docs/basic_usage/openai_api_vision.ipynb index 1db599dcfa90..b6e6a1a24eb3 100644 --- a/docs/basic_usage/openai_api_vision.ipynb +++ b/docs/basic_usage/openai_api_vision.ipynb @@ -10,7 +10,7 @@ "A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/guides/vision).\n", "This tutorial covers the vision APIs for vision language models.\n", "\n", - "SGLang supports various vision language models such as Llama 3.2, LLaVA-OneVision, Qwen2.5-VL, Gemma3 and [more](../supported_models/multimodal_language_models.md).\n", + "SGLang supports various vision language models such as Llama 3.2, LLaVA-OneVision, Qwen2.5-VL, Gemma3 and [more](../supported_models/text_generation/multimodal_language_models.md).\n", "\n", "As an alternative to the OpenAI API, you can also use the [SGLang offline 
engine](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/offline_batch_inference_vlm.py)." ] @@ -33,13 +33,16 @@ "from sglang.test.doc_patch import launch_server_cmd\n", "from sglang.utils import wait_for_server, print_highlight, terminate_process\n", "\n", - "vision_process, port = launch_server_cmd(\n", - " \"\"\"\n", - "python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --log-level warning\n", - "\"\"\"\n", + "example_image_url = \"https://raw.githubusercontent.com/sgl-project/sglang/main/examples/assets/example_image.png\"\n", + "logo_image_url = (\n", + " \"https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png\"\n", ")\n", "\n", - "wait_for_server(f\"http://localhost:{port}\")" + "vision_process, port = launch_server_cmd(\"\"\"\n", + "python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --log-level warning\n", + "\"\"\")\n", + "\n", + "wait_for_server(f\"http://localhost:{port}\", process=vision_process)" ] }, { @@ -75,7 +78,7 @@ " {{\n", " \"type\": \"image_url\",\n", " \"image_url\": {{\n", - " \"url\": \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n", + " \"url\": \"{example_image_url}\"\n", " }}\n", " }}\n", " ]\n", @@ -119,9 +122,7 @@ " {\"type\": \"text\", \"text\": \"What’s in this image?\"},\n", " {\n", " \"type\": \"image_url\",\n", - " \"image_url\": {\n", - " \"url\": \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n", - " },\n", + " \"image_url\": {\"url\": example_image_url},\n", " },\n", " ],\n", " }\n", @@ -162,9 +163,7 @@ " },\n", " {\n", " \"type\": \"image_url\",\n", - " \"image_url\": {\n", - " \"url\": \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n", - " },\n", + " \"image_url\": {\"url\": example_image_url},\n", " },\n", " ],\n", " }\n", @@ -203,13 +202,13 @@ " {\n", " \"type\": \"image_url\",\n", " 
\"image_url\": {\n", - " \"url\": \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\",\n", + " \"url\": example_image_url,\n", " },\n", " },\n", " {\n", " \"type\": \"image_url\",\n", " \"image_url\": {\n", - " \"url\": \"https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png\",\n", + " \"url\": logo_image_url,\n", " },\n", " },\n", " {\n", diff --git a/docs/basic_usage/popular_model_usage.rst b/docs/basic_usage/popular_model_usage.rst index 77697e8959e9..ec0268ed7cf2 100644 --- a/docs/basic_usage/popular_model_usage.rst +++ b/docs/basic_usage/popular_model_usage.rst @@ -13,6 +13,7 @@ For more usage examples and recipes, visit the `SGLang Cookbook = 5 * max-concurrency` to measure steady-state performance. Launch a server with `sglang.launch_server` first. + + ```bash + python3 -m sglang.bench_serving --backend sglang --max-concurrency 16 --num-prompts 80 --random-input-len 256 --random-output-len 32 --dataset-name random + ``` + +**`bench_one_batch_server`** sends a single batch as one HTTP request to a running server. Due to only having a single batch, the server is never in a steady-state and metrics will be biased. Launch a server with `sglang.launch_server` first. + + ```bash + python3 -m sglang.bench_one_batch_server --base-url http://127.0.0.1:30000 --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 32 --input-len 256 --output-len 32 + ``` + +**`bench_offline_throughput`** directly instantiates the `Engine` object in-process (no HTTP server) and submits all requests at once via `engine.generate()`. The engine's scheduler handles batching and execution. This measures maximum achievable throughput without any network overhead. ```bash python3 -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10 ``` -- Benchmark online serving. Please use `sglang.launch_server` to launch a server first and run the following command. 
+**`bench_one_batch`** is the lowest-level tool. It directly instantiates a `ModelRunner` and calls `extend()` / `decode()` on a fixed static batch, bypassing the scheduler entirely. The prefill and decode phases are run separately, making profiling easier but rendering the metrics unrealistic. Because there is no dynamic batching, it may run out of memory for batch sizes that a real server can handle (a real server chunks prefill into smaller batches). This is best suited for profiling individual kernel performance. ```bash - python3 -m sglang.bench_serving --backend sglang --num-prompt 10 + python3 -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 32 --input-len 256 --output-len 32 ``` ## Profile with PyTorch Profiler @@ -43,7 +54,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --sharegpt-output-len 100 --profile ``` -Please make sure that the `SGLANG_TORCH_PROFILER_DIR` should be set at both server and client side, otherwise the trace file cannot be generated correctly . A secure way will be setting `SGLANG_TORCH_PROFILER_DIR` in the `.*rc` file of shell (e.g. `~/.bashrc` for bash shells). +The `SGLANG_TORCH_PROFILER_DIR` environment variable must be set on both the server and client side; otherwise, the trace file will not be generated correctly. A secure way to do this is by setting it in your shell's resource file (e.g., `~/.bashrc` for bash). For more details, please refer to [Bench Serving Guide](./bench_serving.md). diff --git a/docs/developer_guide/contribution_guide.md b/docs/developer_guide/contribution_guide.md index dde033771461..5f78456a25de 100644 --- a/docs/developer_guide/contribution_guide.md +++ b/docs/developer_guide/contribution_guide.md @@ -73,6 +73,8 @@ Then your PR can be merged. 
We have a lot of open PRs but limited CI machines, so only top and trusted contributors have permission to trigger CI tests. Users with permission are listed in the [CI_PERMISSIONS.json](https://github.com/sgl-project/sglang/blob/main/.github/CI_PERMISSIONS.json) +**PR authors** can always use `/rerun-failed-ci` on their own PRs, even if they are not listed in `CI_PERMISSIONS.json`. + For CI to run on a pull request, it must have the "run-ci" label. Authorized users can add the label or rerun failed tests by commenting on the PR with one of these commands: - `/tag-run-ci-label`: Adds the "run-ci" label. Every future commit will trigger CI. @@ -86,7 +88,7 @@ To avoid spamming a PR with too many `/rerun-failed-ci` comments, you can also t Example of rerunning a single test stage: `/rerun-stage unit-test-backend-4-gpu`. -If you don’t have permission, please ask maintainers to trigger CI for you. +If you don’t have permission and you’re not the PR author, please ask maintainers to trigger CI for you. ### CI rate limits diff --git a/docs/developer_guide/development_guide_using_docker.md b/docs/developer_guide/development_guide_using_docker.md index e38947902458..a833011c62b1 100644 --- a/docs/developer_guide/development_guide_using_docker.md +++ b/docs/developer_guide/development_guide_using_docker.md @@ -55,7 +55,7 @@ Some useful volumes to mount are: 1. **Huggingface model cache**: mounting model cache can avoid re-download every time docker restarts. Default location on Linux is `~/.cache/huggingface/`. 2. **SGLang repository**: code changes in the SGLang local repository will be automatically synced to the .devcontainer. -Example 1: Monting local cache folder `/opt/dlami/nvme/.cache` but not the SGLang repo. Use this when you prefer to manually transfer local code changes to the devcontainer. +Example 1: Mounting local cache folder `/opt/dlami/nvme/.cache` but not the SGLang repo. Use this when you prefer to manually transfer local code changes to the devcontainer. 
```bash docker run -itd --shm-size 32g --gpus all -v /opt/dlami/nvme/.cache:/root/.cache --ipc=host --network=host --privileged --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh docker exec -it sglang_zhyncs /bin/zsh diff --git a/docs/developer_guide/development_jit_kernel_guide.md b/docs/developer_guide/development_jit_kernel_guide.md index 2fb3422748e9..ce0e9b8aabd1 100644 --- a/docs/developer_guide/development_jit_kernel_guide.md +++ b/docs/developer_guide/development_jit_kernel_guide.md @@ -257,3 +257,53 @@ from sglang.jit_kernel.add_constant import add_constant ``` For a complete, runnable example, refer to [test_add_constant.py](../../python/sglang/jit_kernel/tests/test_add_constant.py). + +## C++ Include Library Reference + +The JIT kernel framework provides a set of reusable C++ headers in +`python/sglang/jit_kernel/include/sgl_kernel/`. Each header is designed +to be lightweight and self-contained. Below is a summary of each header +and its key APIs. + +### Core Utilities + +| Header | Namespace | Purpose | +|--------|-----------|---------| +| `utils.h` | `host` | Host-side essentials: `RuntimeCheck`, `Panic`, `div_ceil`, `irange` | +| `utils.cuh` | `device` / `host` | Type aliases (`fp16_t`, `bf16_t`, ...), `SGL_DEVICE` macro, PDL helpers, `LaunchKernel`, `RuntimeDeviceCheck` | +| `source_location.h` | (global) | Portable `std::source_location` wrapper for error reporting | +| `runtime.cuh` | `host::runtime` | CUDA runtime queries: `get_blocks_per_sm`, `get_sm_count`, `get_cc_major`, `get_runtime_version`, `get_available_dynamic_smem_per_block` | + +### Tensor Validation + +| Header | Namespace | Purpose | +|--------|-----------|---------| +| `tensor.h` | `host` | `TensorMatcher`, `SymbolicSize`, `SymbolicDType`, `SymbolicDevice` | + +### Math & Type System + +| Header | Namespace | Purpose | +|--------|-----------|---------| +| `math.cuh` | `device::math` | `max`, `min`, `abs`, `sqrt`, `rsqrt`, `exp`, `sin`, `cos`, constants | +| `type.cuh` | (global) / 
`device` | `dtype_trait`, `packed_t`, `device::cast(from)` | + +### Memory Access + +| Header | Namespace | Purpose | +|--------|-----------|---------| +| `vec.cuh` | `device` | `AlignedVector` - vectorized load/store (up to 128-bit; 256-bit requires Blackwell GPUs) | +| `tile.cuh` | `device::tile` | `Memory` - cooperative tiled memory I/O (thread/warp/CTA) | + +### Parallel Primitives + +| Header | Namespace | Purpose | +|--------|-----------|---------| +| `warp.cuh` | `device::warp` | `reduce_sum`, `reduce_max` via `__shfl_xor_sync` | +| `cta.cuh` | `device::cta` | `reduce_max` across warps via shared memory | +| `atomic.cuh` | `device::atomic` | `max` - atomic float max (CUDA + ROCm fallback) | + +### Reusable Kernel Templates + +| Header | Namespace | Purpose | +|--------|-----------|---------| +| `impl/norm.cuh` | `host::norm` / `device::norm` | RMSNorm building blocks (warp & CTA paths, `StorageType`) | diff --git a/docs/developer_guide/evaluating_new_models.md b/docs/developer_guide/evaluating_new_models.md index 19965ed781f9..f3126c9a0d88 100644 --- a/docs/developer_guide/evaluating_new_models.md +++ b/docs/developer_guide/evaluating_new_models.md @@ -26,7 +26,7 @@ python -m sglang.test.run_eval \ ```bash python -m sglang.test.few_shot_gsm8k \ - --host http://127.0.0.1 \ + --host 127.0.0.1 \ --port 30000 \ --num-questions 200 \ --num-shots 5 @@ -36,7 +36,7 @@ python -m sglang.test.few_shot_gsm8k \ ```bash python benchmark/hellaswag/bench_sglang.py \ - --host http://127.0.0.1 \ + --host 127.0.0.1 \ --port 30000 \ --num-questions 200 \ --num-shots 20 @@ -54,7 +54,7 @@ python -m sglang.test.run_eval \ ``` ```{tip} -For reasoning models, add `--thinking-mode ` (e.g., `qwen3`, `deepseek-r1`, `deepseek-v3`). You may skip it if the model has forced thinking enabled. +For reasoning models, add `--thinking-mode ` (e.g., `qwen3`, `deepseek-v3`). You may skip it if the model has forced thinking enabled. 
``` **HumanEval** diff --git a/docs/developer_guide/setup_github_runner.md b/docs/developer_guide/setup_github_runner.md index 67d1b0be0439..6a0e92cdc4ec 100644 --- a/docs/developer_guide/setup_github_runner.md +++ b/docs/developer_guide/setup_github_runner.md @@ -1,4 +1,4 @@ -# Set Up Self-Hosted Runners for GitHub Action +# Set Up Self-Hosted Runners for GitHub Actions ## Add a Runner diff --git a/python/sglang/multimodal_gen/docs/cli.md b/docs/diffusion/api/cli.md similarity index 74% rename from python/sglang/multimodal_gen/docs/cli.md rename to docs/diffusion/api/cli.md index ae294e76fc2d..f0f8d7da4574 100644 --- a/python/sglang/multimodal_gen/docs/cli.md +++ b/docs/diffusion/api/cli.md @@ -5,7 +5,6 @@ The SGLang-diffusion CLI provides a quick way to access the inference pipeline f ## Prerequisites - A working SGLang diffusion installation and the `sglang` CLI available in `$PATH`. -- Python 3.11+ if you plan to use the OpenAI Python SDK. ## Supported Arguments @@ -13,7 +12,6 @@ The SGLang-diffusion CLI provides a quick way to access the inference pipeline f ### Server Arguments - `--model-path {MODEL_PATH}`: Path to the model or model ID -- `--vae-path {VAE_PATH}`: Path to a custom VAE model or HuggingFace model ID (e.g., `fal/FLUX.2-Tiny-AutoEncoder`). If not specified, the VAE will be loaded from the main model path. - `--lora-path {LORA_PATH}`: Path to a LoRA adapter (local path or HuggingFace model ID). If not specified, LoRA will not be applied. - `--lora-nickname {NAME}`: Nickname for the LoRA adapter. (default: `default`). 
- `--num-gpus {NUM_GPUS}`: Number of GPUs to use @@ -35,7 +33,7 @@ The SGLang-diffusion CLI provides a quick way to access the inference pipeline f - `--seed {SEED}`: Random seed for reproducible generation -#### Image/Video Configuration +**Image/Video Configuration** - `--height {HEIGHT}`: Height of the generated output - `--width {WIDTH}`: Width of the generated output @@ -43,7 +41,35 @@ The SGLang-diffusion CLI provides a quick way to access the inference pipeline f - `--fps {FPS}`: Frames per second for the saved output, if this is a video-generation task -#### Output Options +**Frame Interpolation** (video only) + +Frame interpolation is a post-processing step that synthesizes new frames +between each pair of consecutive generated frames, producing smoother +motion without re-running the diffusion model. The `--frame-interpolation-exp` +flag controls how many rounds of interpolation to apply: each round inserts one +new frame into every gap between adjacent frames, so the output frame count +follows the formula **(N − 1) × 2^exp + 1** (e.g. 5 original frames with +`exp=1` → 4 gaps × 1 new frame + 5 originals = **9** frames; with `exp=2` → +**17** frames). + +- `--enable-frame-interpolation`: Enable frame interpolation. Model weights are downloaded automatically on first use. +- `--frame-interpolation-exp {EXP}`: Interpolation exponent — `1` = 2× temporal resolution, `2` = 4×, etc. 
(default: `1`) +- `--frame-interpolation-scale {SCALE}`: RIFE inference scale; use `0.5` for high-resolution inputs to save memory (default: `1.0`) +- `--frame-interpolation-model-path {PATH}`: Local directory or HuggingFace repo ID containing RIFE `flownet.pkl` weights (default: `elfgum/RIFE-4.22.lite`, downloaded automatically) + +Example — generate a 5-frame video and interpolate to 9 frames ((5 − 1) × 2¹ + 1 = 9): + +```bash +sglang generate \ + --model-path Wan-AI/Wan2.2-T2V-A14B-Diffusers \ + --prompt "A dog running through a park" \ + --num-frames 5 \ + --enable-frame-interpolation \ + --frame-interpolation-exp 1 \ + --save-output +``` + +**Output Options** - `--output-path {PATH}`: Directory to save the generated video - `--save-output`: Whether to save the image/video to disk @@ -168,7 +194,7 @@ When enabled, the server follows a **Generate -> Upload -> Delete** workflow: 3. Upon successful upload, the local file is deleted. 4. The API response returns the public URL of the uploaded object. -#### Configuration +**Configuration** Cloud storage is enabled via environment variables. Note that `boto3` must be installed separately (`pip install boto3`) to use this feature. @@ -183,7 +209,7 @@ export SGLANG_S3_SECRET_ACCESS_KEY=your-secret-key export SGLANG_S3_ENDPOINT_URL=https://minio.example.com ``` -See [Environment Variables Documentation](environment_variables.md) for more details. +See [Environment Variables Documentation](../environment_variables.md) for more details. ## Generate @@ -219,6 +245,32 @@ Once the generation task has finished, the server will shut down automatically. > [!NOTE] > The HTTP server-related arguments are ignored in this subcommand. +## Component Path Overrides + +SGLang diffusion allows you to override any pipeline component (e.g., `vae`, `transformer`, `text_encoder`) by specifying a custom checkpoint path. 
This is useful for replacing individual pipeline components with alternative or optimized checkpoints. + +### Example: FLUX.2-dev with Tiny AutoEncoder + +You can override **any** component by using `--<component>-path`, where `<component>` matches the key in the model's `model_index.json`: + +For example, replace the default VAE with a distilled tiny autoencoder for ~3x faster decoding: + +```bash +sglang serve \ + --model-path=black-forest-labs/FLUX.2-dev \ + # with a Huggingface Repo ID + --vae-path=fal/FLUX.2-Tiny-AutoEncoder + # or use a local path + --vae-path=~/.cache/huggingface/hub/models--fal--FLUX.2-Tiny-AutoEncoder/snapshots/.../vae +``` + +**Important:** +- The component key must match the one in your model's `model_index.json` (e.g., `vae`). +- The path must: + - either be a Huggingface Repo ID (e.g., fal/FLUX.2-Tiny-AutoEncoder) + - or point to a **complete component folder**, containing `config.json` and safetensors files + + ## Diffusers Backend SGLang diffusion supports a **diffusers backend** that allows you to run any diffusers-compatible model through SGLang's infrastructure using vanilla diffusers pipelines. This is useful for running models without native SGLang implementations or models with custom pipeline classes. @@ -234,6 +286,8 @@ SGLang diffusion supports a **diffusers backend** that allows you to run any dif | `--vae-slicing` | flag | Enable VAE slicing for lower memory usage (decodes slice-by-slice). | | `--dit-precision` | `fp16`, `bf16`, `fp32` | Precision for the diffusion transformer. | | `--vae-precision` | `fp16`, `bf16`, `fp32` | Precision for the VAE. | +| `--enable-torch-compile` | flag | Enable `torch.compile` for diffusers pipelines. | +| `--cache-dit-config` | `{PATH}` | Path to a Cache-DiT YAML/JSON config file for accelerating diffusers pipelines with Cache-DiT. 
| ### Example: Running Ovis-Image-7B @@ -272,3 +326,7 @@ For pipeline-specific parameters not exposed via CLI, use `diffusers_kwargs` in ```bash sglang generate --config config.json ``` + +### Cache-DiT Acceleration + +Users who use the diffusers backend can also leverage Cache-DiT acceleration and load custom cache configs from a YAML file to boost performance of diffusers pipelines. See the [Cache-DiT Acceleration](https://docs.sglang.io/diffusion/performance/cache/cache_dit.html) documentation for details. diff --git a/python/sglang/multimodal_gen/docs/openai_api.md b/docs/diffusion/api/openai_api.md similarity index 84% rename from python/sglang/multimodal_gen/docs/openai_api.md rename to docs/diffusion/api/openai_api.md index 88dabac4c69a..30d8d5312206 100644 --- a/python/sglang/multimodal_gen/docs/openai_api.md +++ b/docs/diffusion/api/openai_api.md @@ -2,6 +2,10 @@ The SGLang diffusion HTTP server implements an OpenAI-compatible API for image and video generation, as well as LoRA adapter management. +## Prerequisites + +- Python 3.11+ if you plan to use the OpenAI Python SDK. + ## Serve Launch the server using the `sglang serve` command. @@ -25,7 +29,7 @@ sglang serve "${SERVER_ARGS[@]}" - **--model-path**: Path to the model or model ID. - **--port**: HTTP port to listen on (default: `30000`). -#### Get Model Information +**Get Model Information** **Endpoint:** `GET /models` @@ -59,7 +63,7 @@ curl -sS -X GET "http://localhost:30010/models" The server implements an OpenAI-compatible Images API under the `/v1/images` namespace. -#### Create an image +**Create an image** **Endpoint:** `POST /v1/images/generations` @@ -98,9 +102,10 @@ curl -sS -X POST "http://localhost:30010/v1/images/generations" \ ``` > **Note** -> The `response_format=url` option is not supported for `POST /v1/images/generations` and will return a `400` error. +> If `response_format=url` is used and cloud storage is not configured, the API returns +> a relative URL like `/v1/images//content`. 
-#### Edit an image +**Edit an image** **Endpoint:** `POST /v1/images/edits` @@ -130,9 +135,10 @@ curl -sS -X POST "http://localhost:30010/v1/images/edits" \ -F "response_format=url" ``` -#### Download image content +**Download image content** -When `response_format=url` is used with `POST /v1/images/edits`, the API returns a relative URL like `/v1/images//content`. +When `response_format=url` is used with `POST /v1/images/generations` or `POST /v1/images/edits`, +the API returns a relative URL like `/v1/images//content`. **Endpoint:** `GET /v1/images/{image_id}/content` @@ -148,7 +154,7 @@ curl -sS -L "http://localhost:30010/v1/images//content" \ The server implements a subset of the OpenAI Videos API under the `/v1/videos` namespace. -#### Create a video +**Create a video** **Endpoint:** `POST /v1/videos` @@ -178,7 +184,7 @@ curl -sS -X POST "http://localhost:30010/v1/videos" \ }' ``` -#### List videos +**List videos** **Endpoint:** `GET /v1/videos` @@ -197,7 +203,7 @@ curl -sS -X GET "http://localhost:30010/v1/videos" \ -H "Authorization: Bearer sk-proj-1234567890" ``` -#### Download video content +**Download video content** **Endpoint:** `GET /v1/videos/{video_id}/content` @@ -239,7 +245,7 @@ The server supports dynamic loading, merging, and unmerging of LoRA adapters. - Switching: To switch LoRAs, you must first `unmerge` the current one, then `set` the new one - Caching: The server caches loaded LoRA weights in memory. Switching back to a previously loaded LoRA (same path) has little cost -#### Set LoRA Adapter +**Set LoRA Adapter** Loads one or more LoRA adapters and merges their weights into the model. Supports both single LoRA (backward compatible) and multiple LoRA adapters. @@ -301,7 +307,7 @@ curl -X POST http://localhost:30010/v1/set_lora \ > - Multiple LoRAs applied to the same target will be merged in order -#### Merge LoRA Weights +**Merge LoRA Weights** Manually merges the currently set LoRA weights into the base model. 
@@ -323,7 +329,7 @@ curl -X POST http://localhost:30010/v1/merge_lora_weights \ ``` -#### Unmerge LoRA Weights +**Unmerge LoRA Weights** Unmerges the currently active LoRA weights from the base model, restoring it to its original state. This **must** be called before setting a different LoRA. @@ -336,7 +342,7 @@ curl -X POST http://localhost:30010/v1/unmerge_lora_weights \ -H "Content-Type: application/json" ``` -#### List LoRA Adapters +**List LoRA Adapters** Returns loaded LoRA adapters and current application status per module. @@ -389,3 +395,26 @@ Notes: curl -X POST http://localhost:30010/v1/set_lora -d '{"lora_nickname": "lora_b", "lora_path": "path/to/B"}' ``` 5. Generate with LoRA B... + +### Adjust Output Quality + +The server supports adjusting output quality and compression levels for both image and video generation through the `output-quality` and `output-compression` parameters. + +#### Parameters + +- **`output-quality`** (string, optional): Preset quality level that automatically sets compression. **Default is `"default"`**. Valid values: + - `"maximum"`: Highest quality (100) + - `"high"`: High quality (90) + - `"medium"`: Medium quality (55) + - `"low"`: Lower quality (35) + - `"default"`: Auto-adjust based on media type (50 for video, 75 for image) + +- **`output-compression`** (integer, optional): Direct compression level override (0-100). **Default is `None`**. When provided (not `None`), takes precedence over `output-quality`. + - `0`: Lowest quality, smallest file size + - `100`: Highest quality, largest file size + +#### Notes + +- **Precedence**: When both `output-quality` and `output-compression` are provided, `output-compression` takes precedence +- **Format Support**: Quality settings apply to JPEG and video formats. 
PNG uses lossless compression and ignores these settings +- **File Size vs Quality**: Lower compression values (or "low" quality preset) produce smaller files but may show visible artifacts diff --git a/python/sglang/multimodal_gen/docs/ci_perf.md b/docs/diffusion/ci_perf.md similarity index 96% rename from python/sglang/multimodal_gen/docs/ci_perf.md rename to docs/diffusion/ci_perf.md index fcedbc39c0c2..088c5be563bc 100644 --- a/python/sglang/multimodal_gen/docs/ci_perf.md +++ b/docs/diffusion/ci_perf.md @@ -1,5 +1,4 @@ - -## Perf baseline generation script +## Perf Baseline Generation Script `python/sglang/multimodal_gen/test/scripts/gen_perf_baselines.py` starts a local diffusion server, issues requests for selected test cases, aggregates stage/denoise-step/E2E timings from the perf log, and writes the results back to the `scenarios` section of `perf_baselines.json`. diff --git a/python/sglang/multimodal_gen/docs/support_matrix.md b/docs/diffusion/compatibility_matrix.md similarity index 82% rename from python/sglang/multimodal_gen/docs/support_matrix.md rename to docs/diffusion/compatibility_matrix.md index eb06afc4adc5..392f3d9b98fc 100644 --- a/python/sglang/multimodal_gen/docs/support_matrix.md +++ b/docs/diffusion/compatibility_matrix.md @@ -16,26 +16,26 @@ default parameters when initializing and generating videos. 
### Video Generation Models -| Model Name | Hugging Face Model ID | Resolutions | TeaCache | Sliding Tile Attn | Sage Attn | Video Sparse Attention (VSA) | Sparse Linear Attention(SLA)| Sage Sparse Linear Attention(SageSLA)| -|:-----------------------------|:--------------------------------------------------|:--------------------|:--------:|:-----------------:|:---------:|:----------------------------:|:----------------------------:|:-----------------------------------------------:| -| FastWan2.1 T2V 1.3B | `FastVideo/FastWan2.1-T2V-1.3B-Diffusers` | 480p | ⭕ | ⭕ | ⭕ | ✅ | ❌ | ❌ | -| FastWan2.2 TI2V 5B Full Attn | `FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers` | 720p | ⭕ | ⭕ | ⭕ | ✅ | ❌ | ❌ | -| Wan2.2 TI2V 5B | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | 720p | ⭕ | ⭕ | ✅ | ⭕ | ❌ | ❌ | -| Wan2.2 T2V A14B | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | 480p
720p | ❌ | ❌ | ✅ | ⭕ | ❌ | ❌ | -| Wan2.2 I2V A14B | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | 480p
720p | ❌ | ❌ | ✅ | ⭕ | ❌ | ❌ | -| HunyuanVideo | `hunyuanvideo-community/HunyuanVideo` | 720×1280
544×960 | ❌ | ✅ | ✅ | ⭕ | ❌ | ❌ | -| FastHunyuan | `FastVideo/FastHunyuan-diffusers` | 720×1280
544×960 | ❌ | ✅ | ✅ | ⭕ | ❌ | ❌ | -| Wan2.1 T2V 1.3B | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` | 480p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | -| Wan2.1 T2V 14B | `Wan-AI/Wan2.1-T2V-14B-Diffusers` | 480p, 720p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | -| Wan2.1 I2V 480P | `Wan-AI/Wan2.1-I2V-14B-480P-Diffusers` | 480p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | -| Wan2.1 I2V 720P | `Wan-AI/Wan2.1-I2V-14B-720P-Diffusers` | 720p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | -| TurboWan2.1 T2V 1.3B | `IPostYellow/TurboWan2.1-T2V-1.3B-Diffusers` | 480p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | -| TurboWan2.1 T2V 14B | `IPostYellow/TurboWan2.1-T2V-14B-Diffusers` | 480p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | -| TurboWan2.1 T2V 14B 720P | `IPostYellow/TurboWan2.1-T2V-14B-720P-Diffusers` | 720p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | -| TurboWan2.2 I2V A14B | `IPostYellow/TurboWan2.2-I2V-A14B-Diffusers` | 720p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | - -**Note**:
-1.Wan2.2 TI2V 5B has some quality issues when performing I2V generation. We are working on fixing this issue.
+| Model Name | Hugging Face Model ID | Resolutions | TeaCache | Sliding Tile Attn | Sage Attn | Video Sparse Attention (VSA) | Sparse Linear Attention (SLA) | Sage Sparse Linear Attention (SageSLA) | Sparse Video Gen 2 (SVG2) | +|:-----------------------------|:--------------------------------------------------|:--------------------|:--------:|:-----------------:|:---------:|:----------------------------:|:----------------------------:|:-----------------------------------------------:|:----------------------------------:| +| FastWan2.1 T2V 1.3B | `FastVideo/FastWan2.1-T2V-1.3B-Diffusers` | 480p | ⭕ | ⭕ | ⭕ | ✅ | ❌ | ❌ | ❌ | +| FastWan2.2 TI2V 5B Full Attn | `FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers` | 720p | ⭕ | ⭕ | ⭕ | ✅ | ❌ | ❌ | ❌ | +| Wan2.2 TI2V 5B | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | 720p | ⭕ | ⭕ | ✅ | ⭕ | ❌ | ❌ | ❌ | +| Wan2.2 T2V A14B | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | 480p
720p | ❌ | ❌ | ✅ | ⭕ | ❌ | ❌ | ❌ | +| Wan2.2 I2V A14B | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | 480p
720p | ❌ | ❌ | ✅ | ⭕ | ❌ | ❌ | ❌ | +| HunyuanVideo | `hunyuanvideo-community/HunyuanVideo` | 720×1280
544×960 | ❌ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ | +| FastHunyuan | `FastVideo/FastHunyuan-diffusers` | 720×1280
544×960 | ❌ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ | +| Wan2.1 T2V 1.3B | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` | 480p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ | +| Wan2.1 T2V 14B | `Wan-AI/Wan2.1-T2V-14B-Diffusers` | 480p, 720p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ | +| Wan2.1 I2V 480P | `Wan-AI/Wan2.1-I2V-14B-480P-Diffusers` | 480p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ | +| Wan2.1 I2V 720P | `Wan-AI/Wan2.1-I2V-14B-720P-Diffusers` | 720p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ | +| TurboWan2.1 T2V 1.3B | `IPostYellow/TurboWan2.1-T2V-1.3B-Diffusers` | 480p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ⭕ | +| TurboWan2.1 T2V 14B | `IPostYellow/TurboWan2.1-T2V-14B-Diffusers` | 480p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ⭕ | +| TurboWan2.1 T2V 14B 720P | `IPostYellow/TurboWan2.1-T2V-14B-720P-Diffusers` | 720p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ⭕ | +| TurboWan2.2 I2V A14B | `IPostYellow/TurboWan2.2-I2V-A14B-Diffusers` | 720p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ⭕ | + +**Note**: +1.Wan2.2 TI2V 5B has some quality issues when performing I2V generation. We are working on fixing this issue. 2.SageSLA Based on SpargeAttn. Install it first with `pip install git+https://github.com/thu-ml/SpargeAttn.git --no-build-isolation` ### Image Generation Models @@ -55,7 +55,7 @@ default parameters when initializing and generating videos. This section lists example LoRAs that have been explicitly tested and verified with each base model in the **SGLang Diffusion** pipeline. -> Important: \ +> Important: > LoRAs that are not listed here are not necessarily incompatible. > In practice, most standard LoRAs are expected to work, especially those following common Diffusers or SD-style conventions. > The entries below simply reflect configurations that have been manually validated by the SGLang team. 
diff --git a/python/sglang/multimodal_gen/docs/contributing.md b/docs/diffusion/contributing.md similarity index 80% rename from python/sglang/multimodal_gen/docs/contributing.md rename to docs/diffusion/contributing.md index 78330c2ba497..7de656100e16 100644 --- a/python/sglang/multimodal_gen/docs/contributing.md +++ b/docs/diffusion/contributing.md @@ -2,7 +2,15 @@ This guide outlines the requirements for contributing to the SGLang Diffusion module (`sglang.multimodal_gen`). -## 1. Commit Message Convention +## On AI-Assisted ("Vibe Coding") PRs + +Vibe-coded PRs are welcome — we judge code quality, not how it was produced. The bar is the same for all PRs: + +- **No over-commenting.** If the name says it all, skip the docstring. +- **No over-catching.** Don't guard against errors that virtually never happen in practice. +- **Test before submitting.** AI-generated code can be subtly wrong — verify correctness end-to-end. + +## Commit Message Convention We follow a structured commit message format to maintain a clean history. @@ -21,7 +29,7 @@ We follow a structured commit message format to maintain a clean history. - **Scope** (Optional): `cli`, `scheduler`, `model`, `pipeline`, `docs`, etc. - **Subject**: Imperative mood, short and clear (e.g., "add feature" not "added feature"). -## 2. Performance Reporting +## Performance Reporting For PRs that impact **latency**, **throughput**, or **memory usage**, you **should** provide a performance comparison report. @@ -45,7 +53,7 @@ For PRs that impact **latency**, **throughput**, or **memory usage**, you **shou ``` 4. **Paste**: paste the table into the PR description -## 3. 
CI-Based Change Protection +## CI-Based Change Protection Consider adding tests to the `pr-test` or `nightly-test` suites to safeguard your changes, especially for PRs that: diff --git a/python/sglang/multimodal_gen/docs/environment_variables.md b/docs/diffusion/environment_variables.md similarity index 94% rename from python/sglang/multimodal_gen/docs/environment_variables.md rename to docs/diffusion/environment_variables.md index 2c07a3aec5ce..b02d7beb749b 100644 --- a/python/sglang/multimodal_gen/docs/environment_variables.md +++ b/docs/diffusion/environment_variables.md @@ -1,11 +1,11 @@ ## Caching Acceleration These variables configure caching acceleration for Diffusion Transformer (DiT) models. -SGLang supports multiple caching strategies - see [caching documentation](cache/caching.md) for an overview. +SGLang supports multiple caching strategies - see [caching documentation](performance/cache/index.md) for an overview. ### Cache-DiT Configuration -See [cache-dit documentation](cache/cache_dit.md) for detailed configuration. +See [cache-dit documentation](performance/cache/cache_dit.md) for detailed configuration. | Environment Variable | Default | Description | |-------------------------------------|---------|------------------------------------------| diff --git a/docs/diffusion/index.md b/docs/diffusion/index.md new file mode 100644 index 000000000000..3a1aa815a462 --- /dev/null +++ b/docs/diffusion/index.md @@ -0,0 +1,98 @@ +# SGLang Diffusion + +SGLang Diffusion is an inference framework for accelerated image and video generation using diffusion models. It provides an end-to-end unified pipeline with optimized kernels and an efficient scheduler loop. 
+ +## Key Features + +- **Broad Model Support**: Wan series, FastWan series, Hunyuan, Qwen-Image, Qwen-Image-Edit, Flux, Z-Image, GLM-Image, and more +- **Fast Inference**: Optimized kernels, efficient scheduler loop, and Cache-DiT acceleration +- **Ease of Use**: OpenAI-compatible API, CLI, and Python SDK +- **Multi-Platform**: NVIDIA GPUs (H100, H200, A100, B200, 4090), AMD GPUs (MI300X, MI325X) and Ascend NPU (A2, A3) + +--- + +## Quick Start + +### Installation + +```bash +uv pip install "sglang[diffusion]" --prerelease=allow +``` + +See [Installation Guide](installation.md) for more installation methods and ROCm-specific instructions. + +### Basic Usage + +Generate an image with the CLI: + +```bash +sglang generate --model-path Qwen/Qwen-Image \ + --prompt "A beautiful sunset over the mountains" \ + --save-output +``` + +Or start a server with the OpenAI-compatible API: + +```bash +sglang serve --model-path Qwen/Qwen-Image --port 30010 +``` + +--- + +## Documentation + +### Getting Started + +- **[Installation](installation.md)** - Install SGLang Diffusion via pip, uv, Docker, or from source +- **[Compatibility Matrix](compatibility_matrix.md)** - Supported models and optimization compatibility + +### Usage + +- **[CLI Documentation](api/cli.md)** - Command-line interface for `sglang generate` and `sglang serve` +- **[OpenAI API](api/openai_api.md)** - OpenAI-compatible API for image/video generation and LoRA management + +### Performance Optimization + +- **[Performance Overview](performance/index.md)** - Overview of all performance optimization strategies +- **[Attention Backends](performance/attention_backends.md)** - Available attention backends (FlashAttention, SageAttention, etc.) 
+- **[Caching Strategies](performance/cache/index.md)** - Cache-DiT and TeaCache acceleration +- **[Profiling](performance/profiling.md)** - Profiling techniques with PyTorch Profiler and Nsight Systems + +### Reference + +- **[Environment Variables](environment_variables.md)** - Configuration via environment variables +- **[Support New Models](support_new_models.md)** - Guide for adding new diffusion models +- **[Contributing](contributing.md)** - Contribution guidelines and commit message conventions +- **[CI Performance](ci_perf.md)** - Performance baseline generation script + +--- + +## CLI Quick Reference + +### Generate (one-off generation) + +```bash +sglang generate --model-path --prompt "" --save-output +``` + +### Serve (HTTP server) + +```bash +sglang serve --model-path --port 30010 +``` + +### Enable Cache-DiT acceleration + +```bash +SGLANG_CACHE_DIT_ENABLED=true sglang generate --model-path --prompt "" +``` + +--- + +## References + +- [SGLang GitHub](https://github.com/sgl-project/sglang) +- [Cache-DiT](https://github.com/vipshop/cache-dit) +- [FastVideo](https://github.com/hao-ai-lab/FastVideo) +- [xDiT](https://github.com/xdit-project/xDiT) +- [Diffusers](https://github.com/huggingface/diffusers) diff --git a/docs/diffusion/installation.md b/docs/diffusion/installation.md new file mode 100644 index 000000000000..4cd62b10a9d2 --- /dev/null +++ b/docs/diffusion/installation.md @@ -0,0 +1,95 @@ +# Install SGLang-Diffusion + +You can install SGLang-Diffusion using one of the methods below. 
+ +## Standard Installation (NVIDIA GPUs) + +### Method 1: With pip or uv + +It is recommended to use uv for a faster installation: + +```bash +pip install --upgrade pip +pip install uv +uv pip install "sglang[diffusion]" --prerelease=allow +``` + +### Method 2: From source + +```bash +# Use the latest release branch +git clone https://github.com/sgl-project/sglang.git +cd sglang + +# Install the Python packages +pip install --upgrade pip +pip install -e "python[diffusion]" + +# With uv +uv pip install -e "python[diffusion]" --prerelease=allow +``` + +### Method 3: Using Docker + +The Docker images are available on Docker Hub at [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang), built from the [Dockerfile](https://github.com/sgl-project/sglang/blob/main/docker/Dockerfile). +Replace `` below with your HuggingFace Hub [token](https://huggingface.co/docs/hub/en/security-tokens). + +```bash +docker run --gpus all \ + --shm-size 32g \ + -p 30000:30000 \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HF_TOKEN=" \ + --ipc=host \ + lmsysorg/sglang:dev \ + zsh -c '\ + echo "Installing diffusion dependencies..." && \ + pip install -e "python[diffusion]" && \ + echo "Starting SGLang-Diffusion..." && \ + sglang generate \ + --model-path black-forest-labs/FLUX.1-dev \ + --prompt "A logo With Bold Large text: SGL Diffusion" \ + --save-output \ + ' +``` + +## Platform-Specific: ROCm (AMD GPUs) + +For AMD Instinct GPUs (e.g., MI300X), you can use the ROCm-enabled Docker image: + +```bash +docker run --device=/dev/kfd --device=/dev/dri --ipc=host \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env HF_TOKEN= \ + lmsysorg/sglang:v0.5.5.post2-rocm700-mi30x \ + sglang generate --model-path black-forest-labs/FLUX.1-dev --prompt "A logo With Bold Large text: SGL Diffusion" --save-output +``` + +For detailed ROCm system configuration and installation from source, see [AMD GPUs](../platforms/amd_gpu.md). 
+ +## Platform-Specific: MUSA (Moore Threads GPUs) + +For Moore Threads GPUs (MTGPU) with the MUSA software stack: + +```bash +# Clone the repository +git clone https://github.com/sgl-project/sglang.git +cd sglang + +# Install the Python packages +pip install --upgrade pip +rm -f python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml +pip install -e "python[all_musa]" +``` + +## Platform-Specific: Ascend NPU + +For Ascend NPU, please follow the [NPU installation guide](../platforms/ascend_npu.md). + +Quick test: + +```bash +sglang generate --model-path black-forest-labs/FLUX.1-dev \ + --prompt "A logo With Bold Large text: SGL Diffusion" \ + --save-output +``` diff --git a/python/sglang/multimodal_gen/docs/attention_backends.md b/docs/diffusion/performance/attention_backends.md similarity index 80% rename from python/sglang/multimodal_gen/docs/attention_backends.md rename to docs/diffusion/performance/attention_backends.md index 6b2f85c07c91..9113d5bb15bb 100644 --- a/python/sglang/multimodal_gen/docs/attention_backends.md +++ b/docs/diffusion/performance/attention_backends.md @@ -14,6 +14,7 @@ When using the diffusers backend, `--attention-backend` is passed through to dif - **CUDA**: prefers FlashAttention (FA3/FA4) when supported; otherwise falls back to PyTorch SDPA. - **ROCm**: uses FlashAttention when available; otherwise falls back to PyTorch SDPA. - **MPS**: always uses PyTorch SDPA. +- **NPU**: always uses PyTorch SDPA. ## Backend options @@ -29,6 +30,7 @@ For SGLang-native pipelines, the CLI accepts the lowercase names of `AttentionBa | `video_sparse_attn` | `VIDEO_SPARSE_ATTN` | Requires `vsa`. Configure `sparsity` via `--attention-backend-config`. | | `vmoba_attn` | `VMOBA_ATTN` | Requires `kernel.attn.vmoba_attn.vmoba`. Configure via `--attention-backend-config`. | | `aiter` | `AITER` | Requires `aiter`. | +| `sparse_video_gen_2_attn` | `SPARSE_VIDEO_GEN_2_ATTN` | Requires `svg`. 
See installation instructions at https://github.com/svg-project/Sparse-VideoGen. | ## Selection priority @@ -47,7 +49,7 @@ Some backends require additional configuration. You can pass these parameters vi ### Supported Configuration Parameters -#### Sliding Tile Attention (`sliding_tile_attn`) +**Sliding Tile Attention (`sliding_tile_attn`)** | Parameter | Type | Description | Default | | :--- | :--- | :--- | :--- | @@ -55,13 +57,13 @@ Some backends require additional configuration. You can pass these parameters vi | `sta_mode` | `str` | Mode of STA. | `STA_inference` | | `skip_time_steps` | `int` | Number of steps to use full attention before switching to sparse attention. | `15` | -#### Video Sparse Attention (`video_sparse_attn`) +**Video Sparse Attention (`video_sparse_attn`)** | Parameter | Type | Description | Default | | :--- | :--- | :--- | :--- | | `sparsity` | `float` | Validation sparsity (0.0 - 1.0). | `0.0` | -#### V-MoBA (`vmoba_attn`) +**V-MoBA (`vmoba_attn`)** | Parameter | Type | Description | Default | | :--- | :--- | :--- | :--- | @@ -82,16 +84,17 @@ Some backends require additional configuration. You can pass these parameters vi ## Platform support matrix -| Backend | CUDA | ROCm | MPS | Notes | -|---|---:|---:|---:|---| -| `fa` | ✅ | ✅ | ❌ | CUDA requires SM80+ and fp16/bf16. FlashAttention is only used when the required runtime is installed; otherwise it falls back to `torch_sdpa`. | -| `torch_sdpa` | ✅ | ✅ | ✅ | Most compatible option across platforms. | -| `sliding_tile_attn` | ✅ | ❌ | ❌ | CUDA-only. Requires `st_attn`. Configure via `--attention-backend-config`. | -| `sage_attn` | ✅ | ❌ | ❌ | CUDA-only (optional dependency). | -| `sage_attn_3` | ✅ | ❌ | ❌ | CUDA-only (optional dependency). | -| `video_sparse_attn` | ✅ | ❌ | ❌ | CUDA-only. Requires `vsa`. Configure `sparsity` via `--attention-backend-config`. | -| `vmoba_attn` | ✅ | ❌ | ❌ | CUDA-only. Requires `kernel.attn.vmoba_attn.vmoba`. Configure via `--attention-backend-config`. 
| -| `aiter` | ✅ | ❌ | ❌ | Requires `aiter`. | +| Backend | CUDA | ROCm | MPS | NPU | Notes | +|---|---:|---:|---:|---:|---| +| `fa` | ✅ | ✅ | ❌ | ❌ | CUDA requires SM80+ and fp16/bf16. FlashAttention is only used when the required runtime is installed; otherwise it falls back to `torch_sdpa`. | +| `torch_sdpa` | ✅ | ✅ | ✅ | ✅ | Most compatible option across platforms. | +| `sliding_tile_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only. Requires `st_attn`. Configure via `--attention-backend-config`. | +| `sage_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only (optional dependency). | +| `sage_attn_3` | ✅ | ❌ | ❌ | ❌ | CUDA-only (optional dependency). | +| `video_sparse_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only. Requires `vsa`. Configure `sparsity` via `--attention-backend-config`. | +| `vmoba_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only. Requires `kernel.attn.vmoba_attn.vmoba`. Configure via `--attention-backend-config`. | +| `aiter` | ✅ | ❌ | ❌ | ❌ | Requires `aiter`. | +| `sparse_video_gen_2_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only. Requires `svg`. | ## Usage diff --git a/python/sglang/multimodal_gen/docs/cache/cache_dit.md b/docs/diffusion/performance/cache/cache_dit.md similarity index 71% rename from python/sglang/multimodal_gen/docs/cache/cache_dit.md rename to docs/diffusion/performance/cache/cache_dit.md index 9e0a0f66a7a9..8d4ce599fe23 100644 --- a/python/sglang/multimodal_gen/docs/cache/cache_dit.md +++ b/docs/diffusion/performance/cache/cache_dit.md @@ -1,9 +1,5 @@ # Cache-DiT Acceleration -> **Note**: This is one of two caching strategies available in SGLang. -> For an overview of all caching options, see [caching.md](caching.md). -> For TeaCache documentation, see [teacache.md](teacache.md). - SGLang integrates [Cache-DiT](https://github.com/vipshop/cache-dit), a caching acceleration engine for Diffusion Transformers (DiT), to achieve up to **1.69x inference speedup** with minimal quality loss. 
## Overview @@ -24,15 +20,15 @@ sglang generate --model-path Qwen/Qwen-Image \ --prompt "A beautiful sunset over the mountains" ``` -## Diffusers Backend Configuration +## Diffusers Backend Cache-DiT supports loading acceleration configs from a custom YAML file. For -diffusers pipelines, pass the YAML/JSON path via `--cache-dit-config`. This +diffusers pipelines (`diffusers` backend), pass the YAML/JSON path via `--cache-dit-config`. This flow requires cache-dit >= 1.2.0 (`cache_dit.load_configs`). ### Single GPU inference -Define a `config.yaml` file that contains: +Define a `cache.yaml` file that contains: ```yaml cache_config: @@ -50,15 +46,72 @@ cache_config: Then apply the config with: ```bash -sglang generate --backend diffusers \ +sglang generate \ + --backend diffusers \ --model-path Qwen/Qwen-Image \ - --cache-dit-config config.yaml \ + --cache-dit-config cache.yaml \ --prompt "A beautiful sunset over the mountains" ``` ### Distributed inference -Define a `parallel_config.yaml` file that contains: +- 1D Parallelism + +Define a parallelism only config yaml `parallel.yaml` file that contains: + +```yaml +parallelism_config: + ulysses_size: auto + parallel_kwargs: + attention_backend: native + extra_parallel_modules: ["text_encoder", "vae"] +``` + +Then, apply the distributed inference acceleration config from yaml. `ulysses_size: auto` means that cache-dit will auto detect the `world_size` as the ulysses_size. Otherwise, you should manually set it as specific int number, e.g, 4. 
+ +Then apply the distributed config with: (Note: please add `--num-gpus N` to specify the number of gpus for distributed inference) + +```bash +sglang generate \ + --backend diffusers \ + --num-gpus 4 \ + --model-path Qwen/Qwen-Image \ + --cache-dit-config parallel.yaml \ + --prompt "A futuristic cityscape at sunset" +``` + +- 2D Parallelism + +You can also define a 2D parallelism config yaml `parallel_2d.yaml` file that contains: + +```yaml +parallelism_config: + ulysses_size: auto + tp_size: 2 + parallel_kwargs: + attention_backend: native + extra_parallel_modules: ["text_encoder", "vae"] +``` +Then, apply the 2D parallelism config from yaml. Here `tp_size: 2` means using tensor parallelism with size 2. The `ulysses_size: auto` means that cache-dit will auto detect the `world_size // tp_size` as the ulysses_size. + +- 3D Parallelism + +You can also define a 3D parallelism config yaml `parallel_3d.yaml` file that contains: + +```yaml +parallelism_config: + ulysses_size: 2 + ring_size: 2 + tp_size: 2 + parallel_kwargs: + attention_backend: native + extra_parallel_modules: ["text_encoder", "vae"] +``` +Then, apply the 3D parallelism config from yaml. Here `ulysses_size: 2`, `ring_size: 2`, `tp_size: 2` means using ulysses parallelism with size 2, ring parallelism with size 2 and tensor parallelism with size 2. + +### Hybrid Cache and Parallelism + +Define a hybrid cache and parallel acceleration config yaml `hybrid.yaml` file that contains: ```yaml cache_config: @@ -78,16 +131,15 @@ parallelism_config: extra_parallel_modules: ["text_encoder", "vae"] ``` -`ulysses_size: auto` means cache-dit will auto-detect the world_size. Otherwise, -set it to a specific integer (e.g., `4`). - -Then apply the distributed config with: +Then, apply the hybrid cache and parallel acceleration config from yaml. 
```bash -sglang generate --backend diffusers \ +sglang generate \ + --backend diffusers \ + --num-gpus 4 \ --model-path Qwen/Qwen-Image \ - --cache-dit-config parallel_config.yaml \ - --prompt "A futuristic cityscape at sunset" + --cache-dit-config hybrid.yaml \ + --prompt "A beautiful sunset over the mountains" ``` ## Advanced Configuration @@ -136,7 +188,7 @@ sglang generate --model-path black-forest-labs/FLUX.1-dev \ SCM provides step-level caching control for additional speedup. It decides which denoising steps to compute fully and which to use cached results. -#### SCM Presets +**SCM Presets** SCM is configured with presets: @@ -148,7 +200,7 @@ SCM is configured with presets: | `fast` | ~35% | ~3x | Acceptable | | `ultra` | ~25% | ~4x | Lower | -##### Usage +**Usage** ```bash SGLANG_CACHE_DIT_ENABLED=true \ @@ -157,7 +209,7 @@ sglang generate --model-path Qwen/Qwen-Image \ --prompt "A futuristic cityscape at sunset" ``` -#### Custom SCM Bins +**Custom SCM Bins** For fine-grained control over which steps to compute vs cache: @@ -169,7 +221,7 @@ sglang generate --model-path Qwen/Qwen-Image \ --prompt "A futuristic cityscape at sunset" ``` -#### SCM Policy +**SCM Policy** | Policy | Env Variable | Description | |-----------|---------------------------------------|---------------------------------------------| @@ -178,22 +230,8 @@ sglang generate --model-path Qwen/Qwen-Image \ ## Environment Variables -All Cache-DiT parameters can be set via the following environment variables: - -| Environment Variable | Default | Description | -|-------------------------------------|---------|------------------------------------------| -| `SGLANG_CACHE_DIT_ENABLED` | false | Enable Cache-DiT acceleration | -| `SGLANG_CACHE_DIT_FN` | 1 | First N blocks to always compute | -| `SGLANG_CACHE_DIT_BN` | 0 | Last N blocks to always compute | -| `SGLANG_CACHE_DIT_WARMUP` | 4 | Warmup steps before caching | -| `SGLANG_CACHE_DIT_RDT` | 0.24 | Residual difference threshold | -| 
`SGLANG_CACHE_DIT_MC` | 3 | Max continuous cached steps | -| `SGLANG_CACHE_DIT_TAYLORSEER` | false | Enable TaylorSeer calibrator | -| `SGLANG_CACHE_DIT_TS_ORDER` | 1 | TaylorSeer order (1 or 2) | -| `SGLANG_CACHE_DIT_SCM_PRESET` | none | SCM preset (none/slow/medium/fast/ultra) | -| `SGLANG_CACHE_DIT_SCM_POLICY` | dynamic | SCM caching policy | -| `SGLANG_CACHE_DIT_SCM_COMPUTE_BINS` | not set | Custom SCM compute bins | -| `SGLANG_CACHE_DIT_SCM_CACHE_BINS` | not set | Custom SCM cache bins | +All Cache-DiT parameters can be configured via environment variables. +See [Environment Variables](../../environment_variables.md) for the complete list. ## Supported Models @@ -224,14 +262,6 @@ SGLang Diffusion x Cache-DiT supports almost all models originally supported in ## Troubleshooting -### Distributed environment warning - -``` -WARNING: cache-dit is disabled in distributed environment (world_size=N) -``` - -This is expected behavior. Cache-DiT currently only supports single-GPU inference. - ### SCM disabled for low step count For models with < 8 inference steps (e.g., DMD distilled models), SCM will be automatically disabled. DBCache @@ -239,5 +269,5 @@ acceleration still works. 
## References -- [Cache-Dit](https://github.com/vipshop/cache-dit) -- [SGLang Diffusion](../README.md) +- [Cache-DiT](https://github.com/vipshop/cache-dit) +- [SGLang Diffusion](../index.md) diff --git a/python/sglang/multimodal_gen/docs/cache/caching.md b/docs/diffusion/performance/cache/index.md similarity index 100% rename from python/sglang/multimodal_gen/docs/cache/caching.md rename to docs/diffusion/performance/cache/index.md diff --git a/python/sglang/multimodal_gen/docs/cache/teacache.md b/docs/diffusion/performance/cache/teacache.md similarity index 97% rename from python/sglang/multimodal_gen/docs/cache/teacache.md rename to docs/diffusion/performance/cache/teacache.md index 5eb0b6c19bdd..7960437c7b68 100644 --- a/python/sglang/multimodal_gen/docs/cache/teacache.md +++ b/docs/diffusion/performance/cache/teacache.md @@ -1,7 +1,7 @@ # TeaCache Acceleration > **Note**: This is one of two caching strategies available in SGLang. -> For an overview of all caching options, see [caching.md](caching.md). +> For an overview of all caching options, see [caching](../index.md). TeaCache (Temporal similarity-based caching) accelerates diffusion inference by detecting when consecutive denoising steps are similar enough to skip computation entirely. diff --git a/docs/diffusion/performance/index.md b/docs/diffusion/performance/index.md new file mode 100644 index 000000000000..f61c4e93c17a --- /dev/null +++ b/docs/diffusion/performance/index.md @@ -0,0 +1,72 @@ +# Performance Optimization + +SGLang-Diffusion provides multiple performance optimization strategies to accelerate inference. This section covers all available performance tuning options. 
+ +## Overview + +| Optimization | Type | Description | +|--------------|------|-------------| +| **Cache-DiT** | Caching | Block-level caching with DBCache, TaylorSeer, and SCM | +| **TeaCache** | Caching | Timestep-level caching using L1 similarity | +| **Attention Backends** | Kernel | Optimized attention implementations (FlashAttention, SageAttention, etc.) | +| **Profiling** | Diagnostics | PyTorch Profiler and Nsight Systems guidance | + +## Caching Strategies + +SGLang supports two complementary caching approaches: + +### Cache-DiT + +[Cache-DiT](https://github.com/vipshop/cache-dit) provides block-level caching with advanced strategies. It can achieve up to **1.69x speedup**. + +**Quick Start:** +```bash +SGLANG_CACHE_DIT_ENABLED=true \ +sglang generate --model-path Qwen/Qwen-Image \ + --prompt "A beautiful sunset over the mountains" +``` + +**Key Features:** +- **DBCache**: Dynamic block-level caching based on residual differences +- **TaylorSeer**: Taylor expansion-based calibration for optimized caching +- **SCM**: Step-level computation masking for additional speedup + +See [Cache-DiT Documentation](cache/cache_dit.md) for detailed configuration. + +### TeaCache + +TeaCache (Temporal similarity-based caching) accelerates diffusion inference by detecting when consecutive denoising steps are similar enough to skip computation entirely. + +**Quick Overview:** +- Tracks L1 distance between modulated inputs across timesteps +- When accumulated distance is below threshold, reuses cached residual +- Supports CFG with separate positive/negative caches + +**Supported Models:** Wan (wan2.1, wan2.2), Hunyuan (HunyuanVideo), Z-Image + +See [TeaCache Documentation](cache/teacache.md) for detailed configuration. 
+ +## Attention Backends + +Different attention backends offer varying performance characteristics depending on your hardware and model: + +- **FlashAttention**: Fastest on NVIDIA GPUs with fp16/bf16 +- **SageAttention**: Alternative optimized implementation +- **xformers**: Memory-efficient attention +- **SDPA**: PyTorch native scaled dot-product attention + +See [Attention Backends](attention_backends.md) for platform support and configuration options. + +## Profiling + +To diagnose performance bottlenecks, SGLang-Diffusion supports profiling tools: + +- **PyTorch Profiler**: Built-in Python profiling +- **Nsight Systems**: GPU kernel-level analysis + +See [Profiling Guide](profiling.md) for detailed instructions. + +## References + +- [Cache-DiT Repository](https://github.com/vipshop/cache-dit) +- [TeaCache Paper](https://arxiv.org/abs/2411.14324) diff --git a/python/sglang/multimodal_gen/docs/profiling.md b/docs/diffusion/performance/profiling.md similarity index 100% rename from python/sglang/multimodal_gen/docs/profiling.md rename to docs/diffusion/performance/profiling.md diff --git a/python/sglang/multimodal_gen/docs/support_new_models.md b/docs/diffusion/support_new_models.md similarity index 88% rename from python/sglang/multimodal_gen/docs/support_new_models.md rename to docs/diffusion/support_new_models.md index e51bd68d7b10..3141d5affbfb 100644 --- a/python/sglang/multimodal_gen/docs/support_new_models.md +++ b/docs/diffusion/support_new_models.md @@ -23,7 +23,7 @@ To add support for a new diffusion model, you will primarily need to define or c 3. **`ComposedPipeline` (not a config)**: This is the central class where you define the structure of your model's generation pipeline. You will create a new class that inherits from `ComposedPipelineBase` and, within it, instantiate and chain together the necessary `PipelineStage`s in the correct order. 
See `ComposedPipelineBase` and `PipelineStage` base definitions: - [`ComposedPipelineBase`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/runtime/pipelines/composed_pipeline_base.py) - - [`PipelineStage`]( https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/runtime/pipelines/stages/base.py) + - [`PipelineStage`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/runtime/pipelines/stages/base.py) - [Central registry (models/config mapping)](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/registry.py) 4. **Modules (components referenced by the pipeline)**: Each pipeline references a set of modules that are loaded from the model repository (e.g., Diffusers `model_index.json`) and assembled via the registry/loader. Common modules include: @@ -37,7 +37,7 @@ To add support for a new diffusion model, you will primarily need to define or c ## Available Pipeline Stages -You can build your custom `ComposedPipeline` by combining the following available stages as your will. Each stage is responsible for a specific part of the generation process. +You can build your custom `ComposedPipeline` by combining the following available stages as needed. Each stage is responsible for a specific part of the generation process. | Stage Class | Description | | -------------------------------- | ------------------------------------------------------------------------------------------------------- | @@ -45,7 +45,6 @@ You can build your custom `ComposedPipeline` by combining the following availabl | `TextEncodingStage` | Encodes text prompts into embeddings using one or more text encoders. | | `ImageEncodingStage` | Encodes input images into embeddings, often used in image-to-image tasks. | | `ImageVAEEncodingStage` | Specifically encodes an input image into the latent space using a Variational Autoencoder (VAE). 
| -| `ConditioningStage` | Prepares the conditioning tensors (e.g., from text or image embeddings) for the denoising loop. | | `TimestepPreparationStage` | Prepares the scheduler's timesteps for the diffusion process. | | `LatentPreparationStage` | Creates the initial noisy latent tensor that will be denoised. | | `DenoisingStage` | Executes the main denoising loop, iteratively applying the model (e.g., UNet) to refine the latents. | @@ -88,15 +87,13 @@ To illustrate the process, let's look at how `Qwen-Image-Edit` is implemented. T _required_config_modules = ["processor", "scheduler", "text_encoder", "tokenizer", "transformer", "vae"] def create_pipeline_stages(self, server_args: ServerArgs): - """Set up pipeline stages sequentially.""" - self.add_stage(stage_name="input_validation_stage", stage=InputValidationStage()) - self.add_stage(stage_name="prompt_encoding_stage_primary", stage=ImageEncodingStage(...)) - self.add_stage(stage_name="image_encoding_stage_primary", stage=ImageVAEEncodingStage(...)) - self.add_stage(stage_name="timestep_preparation_stage", stage=TimestepPreparationStage(...)) - self.add_stage(stage_name="latent_preparation_stage", stage=LatentPreparationStage(...)) - self.add_stage(stage_name="conditioning_stage", stage=ConditioningStage()) - self.add_stage(stage_name="denoising_stage", stage=DenoisingStage(...)) - self.add_stage(stage_name="decoding_stage", stage=DecodingStage(...)) + self.add_stage(InputValidationStage()) + self.add_stage(ImageEncodingStage(...)) + self.add_stage(ImageVAEEncodingStage(...)) + self.add_stage(TimestepPreparationStage(...)) + self.add_stage(LatentPreparationStage(...)) + self.add_stage(DenoisingStage(...)) + self.add_stage(DecodingStage(...)) ``` The pipeline is constructed by adding stages in order. `Qwen-Image-Edit` uses `ImageEncodingStage` (for prompt and image processing) and `ImageVAEEncodingStage` (for latent extraction) before standard denoising and decoding. 
diff --git a/docs/get_started/install.md b/docs/get_started/install.md index 259e4b646a70..cee24d05c06f 100644 --- a/docs/get_started/install.md +++ b/docs/get_started/install.md @@ -11,22 +11,34 @@ It is recommended to use uv for faster installation: ```bash pip install --upgrade pip pip install uv -uv pip install "sglang" +uv pip install sglang ``` -**Quick fixes to common problems** -- For CUDA 13, Docker is recommended (see Method 3 note on B300/GB300/CUDA 13). If you do not have Docker access, an extra index url needs to be provided when installing wheels: +### For CUDA 13 + +Docker is recommended (see Method 3 note on B300/GB300/CUDA 13). If you do not have Docker access, follow these steps: + +1. Install PyTorch with CUDA 13 support first: +```bash +# Replace X.Y.Z with the PyTorch version required by your SGLang install +uv pip install torch==X.Y.Z torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 ``` -uv pip install "sglang" --extra-index-url https://download.pytorch.org/whl/cu130 + +2. Install sglang: +```bash +uv pip install sglang ``` -- The `sgl_kernel` wheel for CUDA 13 can be downloaded from [the sgl-project whl releases](https://github.com/sgl-project/whl/blob/gh-pages/cu130/sgl-kernel/index.html). Replace `X.Y.Z` with the `sgl_kernel` version required by your SGLang install (you can find this by running `uv pip show sgl_kernel`). Examples: - ```bash - # x86_64 - uv pip install "https://github.com/sgl-project/whl/releases/download/vX.Y.Z/sgl_kernel-X.Y.Z+cu130-cp310-abi3-manylinux2014_x86_64.whl" - - # aarch64 - uv pip install "https://github.com/sgl-project/whl/releases/download/vX.Y.Z/sgl_kernel-X.Y.Z+cu130-cp310-abi3-manylinux2014_aarch64.whl" - ``` + +3. Install the `sgl_kernel` wheel for CUDA 13 from [the sgl-project whl releases](https://github.com/sgl-project/whl/blob/gh-pages/cu130/sgl-kernel/index.html). 
Replace `X.Y.Z` with the `sgl_kernel` version required by your SGLang install (you can find this by running `uv pip show sgl_kernel`). Examples: +```bash +# x86_64 +uv pip install "https://github.com/sgl-project/whl/releases/download/vX.Y.Z/sgl_kernel-X.Y.Z+cu130-cp310-abi3-manylinux2014_x86_64.whl" + +# aarch64 +uv pip install "https://github.com/sgl-project/whl/releases/download/vX.Y.Z/sgl_kernel-X.Y.Z+cu130-cp310-abi3-manylinux2014_aarch64.whl" +``` + +### Quick fixes to common problems - If you encounter `OSError: CUDA_HOME environment variable is not set`. Please set it to your CUDA install root with either of the following solutions: 1. Use `export CUDA_HOME=/usr/local/cuda-` to set the `CUDA_HOME` environment variable. 2. Install FlashInfer first following [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html), then install SGLang as described above. @@ -35,7 +47,7 @@ uv pip install "sglang" --extra-index-url https://download.pytorch.org/whl/cu130 ```bash # Use the last release branch -git clone -b v0.5.6.post2 https://github.com/sgl-project/sglang.git +git clone -b v0.5.9 https://github.com/sgl-project/sglang.git cd sglang # Install the python packages diff --git a/docs/index.rst b/docs/index.rst index b823bd2b71d4..def5f59599c3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -51,6 +51,7 @@ Its core features include: advanced_features/quantization.md advanced_features/quantized_kv_cache.md advanced_features/expert_parallelism.md + advanced_features/dp_dpa_smg_guide.md advanced_features/lora.ipynb advanced_features/pd_disaggregation.md advanced_features/epd_disaggregation.md @@ -60,6 +61,7 @@ Its core features include: advanced_features/vlm_query.ipynb advanced_features/dp_for_multi_modal_encoder.md advanced_features/cuda_graph_for_multi_modal_encoder.md + advanced_features/piecewise_cuda_graph.md advanced_features/sgl_model_gateway.md advanced_features/deterministic_inference.md advanced_features/observability.md @@ -67,21 
+69,33 @@ Its core features include: advanced_features/sglang_for_rl.md .. toctree:: - :maxdepth: 1 + :maxdepth: 2 :caption: Supported Models - supported_models/generative_models.md - supported_models/multimodal_language_models.md - supported_models/diffusion_language_models.md - supported_models/diffusion_models.md - supported_models/embedding_models.md - supported_models/reward_models.md - supported_models/rerank_models.md - supported_models/classify_models.md - supported_models/support_new_models.md - supported_models/transformers_fallback.md - supported_models/modelscope.md - supported_models/mindspore_models.md + supported_models/text_generation/index + supported_models/retrieval_ranking/index + supported_models/specialized/index + supported_models/extending/index + +.. toctree:: + :maxdepth: 2 + :caption: SGLang Diffusion + + diffusion/index + diffusion/installation + diffusion/compatibility_matrix + diffusion/api/cli + diffusion/api/openai_api + diffusion/performance/index + diffusion/performance/attention_backends + diffusion/performance/profiling + diffusion/performance/cache/index + diffusion/performance/cache/cache_dit + diffusion/performance/cache/teacache + diffusion/support_new_models + diffusion/contributing + diffusion/ci_perf + diffusion/environment_variables .. toctree:: :maxdepth: 1 @@ -117,6 +131,7 @@ Its core features include: references/custom_chat_template.md references/frontend/frontend_index.rst references/post_training_integration.md + references/release_lookup references/learn_more.md .. 
toctree:: diff --git a/docs/performance_dashboard/app.js b/docs/performance_dashboard/app.js index d8b05c3d225f..8bfb12b2ed0c 100644 --- a/docs/performance_dashboard/app.js +++ b/docs/performance_dashboard/app.js @@ -14,21 +14,26 @@ let currentMetricType = 'throughput'; // throughput, latency, ttft, inputThrough // Metric type definitions const metricTypes = { - throughput: { label: 'Overall Throughput', unit: 'tokens/sec', field: 'throughput' }, - outputThroughput: { label: 'Output Throughput', unit: 'tokens/sec', field: 'outputThroughput' }, - inputThroughput: { label: 'Input Throughput', unit: 'tokens/sec', field: 'inputThroughput' }, - latency: { label: 'Latency', unit: 'ms', field: 'latency' }, - ttft: { label: 'Time to First Token', unit: 'ms', field: 'ttft' }, - accLength: { label: 'Accept Length', unit: 'tokens', field: 'accLength', filterInvalid: true } + // Text/VLM metrics + throughput: { label: 'Overall Throughput', unit: 'tokens/sec', field: 'throughput', type: 'text' }, + outputThroughput: { label: 'Output Throughput', unit: 'tokens/sec', field: 'outputThroughput', type: 'text' }, + inputThroughput: { label: 'Input Throughput', unit: 'tokens/sec', field: 'inputThroughput', type: 'text' }, + latency: { label: 'Latency', unit: 'ms', field: 'latency', type: 'text' }, + ttft: { label: 'Time to First Token', unit: 'ms', field: 'ttft', type: 'text' }, + accLength: { label: 'Accept Length', unit: 'tokens', field: 'accLength', filterInvalid: true, type: 'text' }, + // Diffusion metrics + e2eMs: { label: 'End-to-End Time', unit: 'ms', field: 'e2e_ms', type: 'diffusion' }, + avgDenoiseMs: { label: 'Avg Denoise Time', unit: 'ms', field: 'avg_denoise_ms', type: 'diffusion' }, + medianDenoiseMs: { label: 'Median Denoise Time', unit: 'ms', field: 'median_denoise_ms', type: 'diffusion' } }; // Chart.js default configuration for dark theme -Chart.defaults.color = '#8b949e'; -Chart.defaults.borderColor = '#30363d'; +Chart.defaults.color = '#94a3b8'; 
+Chart.defaults.borderColor = '#1e293b'; const chartColors = [ - '#58a6ff', '#3fb950', '#d29922', '#f85149', '#a371f7', - '#79c0ff', '#56d364', '#e3b341', '#ff7b72', '#bc8cff' + '#22d3ee', '#34d399', '#fbbf24', '#f87171', '#a78bfa', + '#67e8f9', '#6ee7b7', '#fcd34d', '#fca5a5', '#c4b5fd' ]; // Initialize the dashboard @@ -53,7 +58,7 @@ async function init() { async function loadData() { // Try local server API first (if running server.py) try { - const response = await fetch('/api/metrics'); + const response = await fetch('/api/metrics', { headers: getAuthHeaders() }); if (response.ok) { const data = await response.json(); if (data.length > 0 && data[0].results && data[0].results.length > 0) { @@ -142,32 +147,51 @@ async function fetchMetricsForRun(run) { } } +// Helper function to detect if result is diffusion type +function isDiffusionResult(result) { + return result.test_type === 'diffusion' || (result.tests && !result.benchmarks); +} + // Populate filter dropdowns function populateFilters() { const gpuConfigs = new Set(); const models = new Set(); + const testNames = new Set(); // For diffusion tests const batchSizes = new Set(); const ioLengths = new Set(); allMetricsData.forEach(run => { run.results.forEach(result => { gpuConfigs.add(result.gpu_config); - models.add(result.model); - // Try new structure first (benchmarks_by_io_len), fall back to flat benchmarks - if (result.benchmarks_by_io_len) { - Object.entries(result.benchmarks_by_io_len).forEach(([ioKey, ioData]) => { - ioLengths.add(ioKey); - ioData.benchmarks.forEach(bench => { + + // Handle diffusion results + if (isDiffusionResult(result)) { + models.add(result.test_suite || 'diffusion'); + if (result.tests) { + result.tests.forEach(test => { + testNames.add(test.test_name); + }); + } + } + // Handle text/VLM results + else { + models.add(result.model); + // Try new structure first (benchmarks_by_io_len), fall back to flat benchmarks + if (result.benchmarks_by_io_len) { + 
Object.entries(result.benchmarks_by_io_len).forEach(([ioKey, ioData]) => { + ioLengths.add(ioKey); + ioData.benchmarks.forEach(bench => { + batchSizes.add(bench.batch_size); + }); + }); + } else if (result.benchmarks) { + result.benchmarks.forEach(bench => { batchSizes.add(bench.batch_size); + if (bench.input_len && bench.output_len) { + ioLengths.add(`${bench.input_len}_${bench.output_len}`); + } }); - }); - } else if (result.benchmarks) { - result.benchmarks.forEach(bench => { - batchSizes.add(bench.batch_size); - if (bench.input_len && bench.output_len) { - ioLengths.add(`${bench.input_len}_${bench.output_len}`); - } - }); + } } }); }); @@ -345,7 +369,16 @@ function createMetricTabs() { const tabsContainer = document.getElementById('metric-tabs'); tabsContainer.innerHTML = ''; - Object.entries(metricTypes).forEach(([key, metric], index) => { + // Detect if current data is diffusion or text + const isDiffusion = detectCurrentDataType() === 'diffusion'; + const dataType = isDiffusion ? 'diffusion' : 'text'; + + // Filter metrics based on data type + const relevantMetrics = Object.entries(metricTypes).filter(([key, metric]) => + metric.type === dataType + ); + + relevantMetrics.forEach(([key, metric], index) => { const tab = document.createElement('div'); tab.className = index === 0 ? 
'tab active' : 'tab'; tab.textContent = metric.label; @@ -353,6 +386,31 @@ function createMetricTabs() { tab.onclick = () => selectMetricTab(key, tab); tabsContainer.appendChild(tab); }); + + // Set initial metric type + if (relevantMetrics.length > 0) { + currentMetricType = relevantMetrics[0][0]; + } +} + +function detectCurrentDataType() { + // Check if currently selected model/GPU config has diffusion data + const gpuFilter = document.getElementById('gpu-filter')?.value; + const modelFilter = currentModel; + + if (!gpuFilter || !modelFilter) return 'text'; + + for (const run of allMetricsData) { + for (const result of run.results) { + if (result.gpu_config === gpuFilter) { + const resultModel = result.test_suite || result.model; + if (resultModel === modelFilter && isDiffusionResult(result)) { + return 'diffusion'; + } + } + } + } + return 'text'; } function selectMetricTab(metricKey, tabElement) { @@ -374,6 +432,8 @@ function handleModelFilterChange(model) { updateVariantFilter(); // Update IO length filter based on new model selection updateIoLenFilter(); + // Recreate metric tabs in case data type changed (text vs diffusion) + createMetricTabs(); updateCharts(); } @@ -383,6 +443,8 @@ function handleGpuFilterChange() { updateVariantFilter(); // Update IO length filter based on new GPU selection updateIoLenFilter(); + // Recreate metric tabs in case data type changed (text vs diffusion) + createMetricTabs(); updateCharts(); } @@ -518,6 +580,7 @@ function prepareChartData(gpuFilter, modelFilter, variantFilter, ioLenFilter, ba // Prepare chart data grouped by batch size - each batch size is a separate series function prepareChartDataByBatch(gpuFilter, modelFilter, variantFilter, ioLenFilter, batchFilter) { const batchDataMap = new Map(); // batch_size -> Map of variant -> data + const testDataMap = new Map(); // For diffusion: test_name -> data allMetricsData.forEach(run => { const runDate = new Date(run.run_date); @@ -525,6 +588,37 @@ function 
prepareChartDataByBatch(gpuFilter, modelFilter, variantFilter, ioLenFil run.results.forEach(result => { // Apply filters - GPU and Model are required (no "all" option) if (result.gpu_config !== gpuFilter) return; + + // Handle diffusion results + if (isDiffusionResult(result)) { + const resultModel = result.test_suite || 'diffusion'; + if (resultModel !== modelFilter) return; + + if (result.tests) { + result.tests.forEach(test => { + const testName = test.test_name; + if (!testDataMap.has(testName)) { + testDataMap.set(testName, { + label: testName, + data: [], + model: resultModel, + testName: testName + }); + } + + testDataMap.get(testName).data.push({ + x: runDate, + e2e_ms: test.e2e_ms, + avg_denoise_ms: test.avg_denoise_ms, + median_denoise_ms: test.median_denoise_ms, + runId: run.run_id + }); + }); + } + return; + } + + // Handle text/VLM results if (result.model !== modelFilter) return; if (variantFilter !== 'all' && result.variant !== variantFilter) return; @@ -622,6 +716,17 @@ function prepareChartDataByBatch(gpuFilter, modelFilter, variantFilter, ioLenFil // Sort data points by date and convert to array format const result = {}; + + // For diffusion data, use test names as "batch sizes" + if (testDataMap.size > 0) { + testDataMap.forEach((series, testName) => { + series.data.sort((a, b) => a.x - b.x); + result[testName] = [series]; // Each test is its own series + }); + return result; + } + + // For text/VLM data, use batch sizes batchDataMap.forEach((variantMap, batchSize) => { variantMap.forEach(series => { series.data.sort((a, b) => a.x - b.x); @@ -642,7 +747,16 @@ function updateMetricChart(chartDataByBatch, metricType) { activeCharts = []; const metric = metricTypes[metricType]; - const batchSizes = Object.keys(chartDataByBatch).sort((a, b) => parseInt(a) - parseInt(b)); + const isDiffusion = metric.type === 'diffusion'; + + // For diffusion, keys are test names; for text, keys are batch sizes + const keys = Object.keys(chartDataByBatch); + if 
(!isDiffusion) { + keys.sort((a, b) => parseInt(a) - parseInt(b)); + } else { + keys.sort(); // Alphabetical sort for test names + } + const batchSizes = keys; // Keep variable name for compatibility if (batchSizes.length === 0) { container.innerHTML = '
No data available for the selected filters
'; @@ -682,7 +796,8 @@ function updateMetricChart(chartDataByBatch, metricType) { const title = document.createElement('div'); title.className = 'batch-chart-title'; - title.textContent = `Batch Size: ${batchSize}`; + // For diffusion, show test name; for text, show batch size + title.textContent = isDiffusion ? `Test: ${batchSize}` : `Batch Size: ${batchSize}`; chartWrapper.appendChild(title); const chartContainer = document.createElement('div'); @@ -726,12 +841,13 @@ function getChartOptions(yAxisLabel) { } }, tooltip: { - backgroundColor: '#21262d', - borderColor: '#30363d', + backgroundColor: '#1a2332', + borderColor: 'rgba(148, 163, 184, 0.1)', borderWidth: 1, - titleFont: { size: 13 }, - bodyFont: { size: 12 }, - padding: 12 + titleFont: { size: 13, family: "'DM Sans', sans-serif" }, + bodyFont: { size: 12, family: "'JetBrains Mono', monospace" }, + padding: 14, + cornerRadius: 8 } }, scales: { @@ -744,7 +860,7 @@ function getChartOptions(yAxisLabel) { } }, grid: { - color: '#21262d' + color: 'rgba(148, 163, 184, 0.06)' } }, y: { @@ -753,7 +869,7 @@ function getChartOptions(yAxisLabel) { text: yAxisLabel }, grid: { - color: '#21262d' + color: 'rgba(148, 163, 184, 0.06)' } } } @@ -832,5 +948,109 @@ function formatNumber(num) { return num.toFixed(1); } +// Authentication state +let authToken = sessionStorage.getItem('dashboard_auth_token') || null; + +// Get auth headers for API requests +function getAuthHeaders() { + const headers = {}; + if (authToken) { + headers['Authorization'] = `Bearer ${authToken}`; + } + return headers; +} + +// Check if server requires authentication and show/hide login accordingly +async function checkAuthAndInit() { + const loginOverlay = document.getElementById('login-overlay'); + const dashboardContainer = document.getElementById('dashboard-container'); + + try { + const response = await fetch('/api/auth-check'); + if (response.ok) { + const data = await response.json(); + if (!data.auth_required) { + // No auth required - skip 
login, show dashboard directly + loginOverlay.style.display = 'none'; + dashboardContainer.style.display = 'block'; + init(); + return; + } + } + } catch (e) { + // Server not available (e.g. static hosting) - skip login + loginOverlay.style.display = 'none'; + dashboardContainer.style.display = 'block'; + init(); + return; + } + + // Auth is required - check if we have a valid token from a previous session + if (authToken) { + try { + const testResponse = await fetch('/api/metrics', { + headers: getAuthHeaders() + }); + if (testResponse.ok) { + loginOverlay.style.display = 'none'; + dashboardContainer.style.display = 'block'; + init(); + return; + } + } catch (e) { + // Token invalid or expired + } + // Clear invalid token + authToken = null; + sessionStorage.removeItem('dashboard_auth_token'); + } + + // Show login form + loginOverlay.style.display = 'flex'; + dashboardContainer.style.display = 'none'; +} + +// Handle login form submission +async function handleLogin(event) { + event.preventDefault(); + + const username = document.getElementById('login-username').value; + const password = document.getElementById('login-password').value; + const errorEl = document.getElementById('login-error'); + const loginBtn = document.getElementById('login-btn'); + + errorEl.textContent = ''; + loginBtn.disabled = true; + loginBtn.textContent = 'Signing in...'; + + try { + const response = await fetch('/api/login', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ username, password }) + }); + + const data = await response.json(); + + if (response.ok && data.token) { + authToken = data.token; + sessionStorage.setItem('dashboard_auth_token', authToken); + + document.getElementById('login-overlay').style.display = 'none'; + document.getElementById('dashboard-container').style.display = 'block'; + init(); + } else { + errorEl.textContent = data.error || 'Invalid username or password'; + } + } catch (e) { + errorEl.textContent = 
'Unable to connect to server'; + } finally { + loginBtn.disabled = false; + loginBtn.textContent = 'Sign In'; + } + + return false; +} + // Initialize on page load -document.addEventListener('DOMContentLoaded', init); +document.addEventListener('DOMContentLoaded', checkAuthAndInit); diff --git a/docs/performance_dashboard/index.html b/docs/performance_dashboard/index.html index 6bd63d72d008..1c5b57bafe38 100644 --- a/docs/performance_dashboard/index.html +++ b/docs/performance_dashboard/index.html @@ -6,18 +6,39 @@ SGLang Performance Dashboard + + + -
-
+ + + +