Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pr-test-xeon.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ jobs:
timeout-minutes: 36
run: |
docker exec -w /sglang-checkout/ ci_sglang_xeon \
bash -c "source /opt/.venv/bin/activate && cd ./test/srt && python3 run_suite.py --suite per-commit-cpu --timeout-per-file 1500"
bash -c "source /opt/.venv/bin/activate && cd ./test && python3 run_suite.py --hw cpu --suite stage-b-test-cpu"

- name: Change permission
timeout-minutes: 2
Expand Down
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ repos:
entry: python3 scripts/ci/check_registered_tests.py
language: system
files: ^test/registered/.*\.py$
exclude: ^test/registered/.*/utils\.py$
pass_filenames: false
- id: check-no-docs-changes
name: reject changes under legacy docs/
Expand Down
60 changes: 30 additions & 30 deletions docs_new/index.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
}}
>
<a
href="https://lmsys.org/blog/2026-04-25-deepseek-v4/"
href="https://lmsys.org/blog/2026-04-29-p2p-update/"
target="_blank"
rel="noopener noreferrer"
style={{
Expand All @@ -104,8 +104,8 @@ It is designed to deliver low-latency and high-throughput inference across a wid
}}
>
<img
src="https://lmsys.org/images/blog/deepseek_v4/benchmark_vs_oss.png"
alt="DeepSeek-V4 on Day 0: From Fast Inference to Verified RL with SGLang and Miles"
src="https://lmsys.org/images/blog/p2p-update/p2p-overview.png"
alt="Updating 1T parameters in seconds — P2P weight transfer in Large Scale Distributed RL"
style={{
width: "100%",
height: "100%",
Expand All @@ -124,7 +124,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
fontSize: "0.98rem",
}}
>
{"DeepSeek-V4 on Day 0: From Fast Inference to Verified RL with SGLang and Miles"}
{"Updating 1T parameters in seconds \u2014 P2P weight transfer in Large Scale Distributed RL"}
</p>
<p
style={{
Expand All @@ -133,12 +133,12 @@ It is designed to deliver low-latency and high-throughput inference across a wid
opacity: 0.75,
}}
>
{"April 25, 2026"}
{"April 29, 2026"}
</p>
</div>
</a>
<a
href="https://lmsys.org/blog/2026-04-10-sglang-hisparse/"
href="https://lmsys.org/blog/2026-04-25-deepseek-v4/"
target="_blank"
rel="noopener noreferrer"
style={{
Expand All @@ -159,8 +159,8 @@ It is designed to deliver low-latency and high-throughput inference across a wid
}}
>
<img
src="https://lmsys.org/images/blog/hisparse/hisparse_overview.png"
alt="HiSparse: Turbocharging Sparse Attention with Hierarchical Memory"
src="https://lmsys.org/images/blog/deepseek_v4/benchmark_vs_oss.png"
alt="DeepSeek-V4 on Day 0: From Fast Inference to Verified RL with SGLang and Miles"
style={{
width: "100%",
height: "100%",
Expand All @@ -179,7 +179,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
fontSize: "0.98rem",
}}
>
{"HiSparse: Turbocharging Sparse Attention with Hierarchical Memory"}
{"DeepSeek-V4 on Day 0: From Fast Inference to Verified RL with SGLang and Miles"}
</p>
<p
style={{
Expand All @@ -188,12 +188,12 @@ It is designed to deliver low-latency and high-throughput inference across a wid
opacity: 0.75,
}}
>
{"April 10, 2026"}
{"April 25, 2026"}
</p>
</div>
</a>
<a
href="https://lmsys.org/blog/2026-03-25-gtc2026/"
href="https://lmsys.org/blog/2026-04-10-sglang-hisparse/"
target="_blank"
rel="noopener noreferrer"
style={{
Expand All @@ -214,8 +214,8 @@ It is designed to deliver low-latency and high-throughput inference across a wid
}}
>
<img
src="https://lmsys.org/images/blog/gtc2026/happyhour-crowd.jpg"
alt="Highlights of SGLang at NVIDIA GTC 2026"
src="https://lmsys.org/images/blog/hisparse/hisparse_overview.png"
alt="HiSparse: Turbocharging Sparse Attention with Hierarchical Memory"
style={{
width: "100%",
height: "100%",
Expand All @@ -234,7 +234,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
fontSize: "0.98rem",
}}
>
{"Highlights of SGLang at NVIDIA GTC 2026"}
{"HiSparse: Turbocharging Sparse Attention with Hierarchical Memory"}
</p>
<p
style={{
Expand All @@ -243,12 +243,12 @@ It is designed to deliver low-latency and high-throughput inference across a wid
opacity: 0.75,
}}
>
{"March 31, 2026"}
{"April 10, 2026"}
</p>
</div>
</a>
<a
href="https://lmsys.org/blog/2026-03-25-eep-partial-failure-tolerance/"
href="https://lmsys.org/blog/2026-03-25-gtc2026/"
target="_blank"
rel="noopener noreferrer"
style={{
Expand All @@ -269,8 +269,8 @@ It is designed to deliver low-latency and high-throughput inference across a wid
}}
>
<img
src="https://lmsys.org/images/blog/eep-partial-failure-tolerance/figure.png"
alt="Elastic EP in SGLang: Achieving Partial Failure Tolerance for DeepSeek MoE Deployments"
src="https://lmsys.org/images/blog/gtc2026/happyhour-crowd.jpg"
alt="Highlights of SGLang at NVIDIA GTC 2026"
style={{
width: "100%",
height: "100%",
Expand All @@ -289,7 +289,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
fontSize: "0.98rem",
}}
>
{"Elastic EP in SGLang: Achieving Partial Failure Tolerance for DeepSeek MoE Deployments"}
{"Highlights of SGLang at NVIDIA GTC 2026"}
</p>
<p
style={{
Expand All @@ -298,12 +298,12 @@ It is designed to deliver low-latency and high-throughput inference across a wid
opacity: 0.75,
}}
>
{"March 25, 2026"}
{"March 31, 2026"}
</p>
</div>
</a>
<a
href="https://lmsys.org/blog/2026-03-17-rocm-miles-rl-amd/"
href="https://lmsys.org/blog/2026-03-25-eep-partial-failure-tolerance/"
target="_blank"
rel="noopener noreferrer"
style={{
Expand All @@ -324,8 +324,8 @@ It is designed to deliver low-latency and high-throughput inference across a wid
}}
>
<img
src="https://lmsys.org/images/blog/rocm_miles_rl/fig_1.png"
alt="ROCm Support for Miles: Large-Scale RL Post-Training on AMD Instinct\u2122 GPUs"
src="https://lmsys.org/images/blog/eep-partial-failure-tolerance/figure.png"
alt="Elastic EP in SGLang: Achieving Partial Failure Tolerance for DeepSeek MoE Deployments"
style={{
width: "100%",
height: "100%",
Expand All @@ -344,7 +344,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
fontSize: "0.98rem",
}}
>
{"ROCm Support for Miles: Large-Scale RL Post-Training on AMD Instinct\u2122 GPUs"}
{"Elastic EP in SGLang: Achieving Partial Failure Tolerance for DeepSeek MoE Deployments"}
</p>
<p
style={{
Expand All @@ -353,12 +353,12 @@ It is designed to deliver low-latency and high-throughput inference across a wid
opacity: 0.75,
}}
>
{"March 17, 2026"}
{"March 25, 2026"}
</p>
</div>
</a>
<a
href="https://lmsys.org/blog/2026-03-11-run-nvidia-nemotron-3-super/"
href="https://lmsys.org/blog/2026-03-17-rocm-miles-rl-amd/"
target="_blank"
rel="noopener noreferrer"
style={{
Expand All @@ -379,8 +379,8 @@ It is designed to deliver low-latency and high-throughput inference across a wid
}}
>
<img
src="https://lmsys.org/images/blog/nemotron-3-super/figure_1.svg"
alt="SGLang Adds Day-0 Support for NVIDIA Nemotron 3 Super for building High-Efficiency Multi-Agent Systems"
src="https://lmsys.org/images/blog/rocm_miles_rl/fig_1.png"
alt="ROCm Support for Miles: Large-Scale RL Post-Training on AMD Instinct™ GPUs"
style={{
width: "100%",
height: "100%",
Expand All @@ -399,7 +399,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
fontSize: "0.98rem",
}}
>
{"SGLang Adds Day-0 Support for NVIDIA Nemotron 3 Super for building High-Efficiency Multi-Agent Systems"}
{"ROCm Support for Miles: Large-Scale RL Post-Training on AMD Instinct\u2122 GPUs"}
</p>
<p
style={{
Expand All @@ -408,7 +408,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
opacity: 0.75,
}}
>
{"March 11, 2026"}
{"March 17, 2026"}
</p>
</div>
</a>
Expand Down
4 changes: 2 additions & 2 deletions scripts/ci/check_registered_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ def main() -> int:
ci_register = importlib.util.module_from_spec(spec)
spec.loader.exec_module(ci_register)

# Same filter as run_suite.py: skip conftest.py and __init__.py
# Same filter as run_suite.py: skip conftest.py, __init__.py, and utils.py
files = sorted(
f
for f in glob.glob("test/registered/**/*.py", recursive=True)
if os.path.basename(f) not in ("conftest.py", "__init__.py")
if os.path.basename(f) not in ("conftest.py", "__init__.py", "utils.py")
)
if not files:
return 0
Expand Down
59 changes: 59 additions & 0 deletions test/registered/cpu/test_activation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import itertools
import unittest

import torch
from utils import GeluAndMul, SiluAndMul, precision

from sglang.srt.server_args import ServerArgs, set_global_server_args_for_scheduler
from sglang.test.ci.ci_register import register_cpu_ci
from sglang.test.test_utils import CustomTestCase

# Declare this file's CPU CI registration under the "stage-b-test-cpu" suite
# (est_time is presumably in seconds — TODO confirm against ci_register).
register_cpu_ci(est_time=10, suite="stage-b-test-cpu")

# Fix the RNG seed so the torch.randn inputs drawn by the tests are repeatable.
torch.manual_seed(1234)


class TestActivation(CustomTestCase):
M = [128, 129, 257]
N = [22016, 22018]
dtype = [torch.float16, torch.bfloat16]

# Check torch.ops.sgl_kernel.silu_and_mul_cpu against the SiluAndMul reference
# for a single (m, n, dtype) case.
def _silu_and_mul_test(self, m, n, dtype):
# Install global server args built from a placeholder model path before the
# kernel is invoked; this executes on every call of this helper.
set_global_server_args_for_scheduler(ServerArgs(model_path="dummy"))
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Calling set_global_server_args_for_scheduler inside _silu_and_mul_test is redundant and inefficient because it is executed for every parameter combination in the test_activation loop. It should be called once at the beginning of test_activation or in a setUp method.


# Random input of shape [m, n] in the requested dtype.
x = torch.randn([m, n], dtype=dtype)

out = torch.ops.sgl_kernel.silu_and_mul_cpu(x)
ref_out = SiluAndMul(x)

# A single tolerance, looked up by the reference output's dtype, serves as
# both atol and rtol.
atol = rtol = precision[ref_out.dtype]
torch.testing.assert_close(ref_out, out, atol=atol, rtol=rtol)

def _gelu_and_mul_test(self, m, n, dtype):
    """Compare the CPU ``gelu_and_mul`` kernel with the ``GeluAndMul`` reference.

    A random ``[m, n]`` tensor of ``dtype`` is passed to
    ``torch.ops.sgl_kernel.gelu_and_mul_cpu`` and to ``GeluAndMul`` with
    ``approximate="none"``; the two results must agree within the tolerance
    that ``precision`` lists for the reference output's dtype.
    """
    inputs = torch.randn([m, n], dtype=dtype)

    actual = torch.ops.sgl_kernel.gelu_and_mul_cpu(inputs)
    expected = GeluAndMul(inputs, approximate="none")

    # One tolerance value is used as both the absolute and the relative bound.
    tolerance = precision[expected.dtype]
    torch.testing.assert_close(expected, actual, atol=tolerance, rtol=tolerance)

def _gelu_tanh_and_mul_test(self, m, n, dtype):
    """Compare the CPU ``gelu_tanh_and_mul`` kernel with the ``GeluAndMul`` reference.

    A random ``[m, n]`` tensor of ``dtype`` is passed to
    ``torch.ops.sgl_kernel.gelu_tanh_and_mul_cpu`` and to ``GeluAndMul`` with
    ``approximate="tanh"``; the two results must agree within the tolerance
    that ``precision`` lists for the reference output's dtype.
    """
    inputs = torch.randn([m, n], dtype=dtype)

    actual = torch.ops.sgl_kernel.gelu_tanh_and_mul_cpu(inputs)
    expected = GeluAndMul(inputs, approximate="tanh")

    # One tolerance value is used as both the absolute and the relative bound.
    tolerance = precision[expected.dtype]
    torch.testing.assert_close(expected, actual, atol=tolerance, rtol=tolerance)

def test_activation(self):
    """Run the three activation checks for every (M, N, dtype) combination.

    Each combination runs inside its own ``subTest`` labelled with ``m``,
    ``n`` and ``dtype``; within it the silu, gelu and gelu-tanh helpers are
    called in that order with the same arguments.
    """
    for m, n, dtype in itertools.product(self.M, self.N, self.dtype):
        with self.subTest(m=m, n=n, dtype=dtype):
            for check in (
                self._silu_and_mul_test,
                self._gelu_and_mul_test,
                self._gelu_tanh_and_mul_test,
            ):
                check(m, n, dtype)


# Run the tests in this file when it is executed directly as a script.
if __name__ == "__main__":
unittest.main()
30 changes: 30 additions & 0 deletions test/registered/cpu/test_binding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import re
import unittest

import torch

# Short alias for the sgl_kernel op namespace used by the test below.
kernel = torch.ops.sgl_kernel

from sglang.test.ci.ci_register import register_cpu_ci
from sglang.test.test_utils import CustomTestCase

# Declare this file's CPU CI registration under the "stage-b-test-cpu" suite
# (est_time is presumably in seconds — TODO confirm against ci_register).
register_cpu_ci(est_time=10, suite="stage-b-test-cpu")


class TestBinding(CustomTestCase):
    """Checks the core bindings reported by ``kernel.init_cpu_threads_env``."""

    def test_binding(self):
        """Request six consecutive cores starting at core 1 and verify the report.

        The core ids are passed as a comma-separated string; every
        ``OMP tid: <n>, core <id>`` entry in the returned text is collected,
        and the collected ids must equal the requested ones in both count
        and order.
        """
        first_core = 1
        core_count = 6

        wanted = [str(core) for core in range(first_core, first_core + core_count)]
        report = kernel.init_cpu_threads_env(",".join(wanted))

        reported = re.findall(r"OMP tid: \d+, core (\d+)", report)
        self.assertEqual(len(reported), core_count)

        self.assertEqual(reported, wanted)


# Run the tests in this file when it is executed directly as a script.
if __name__ == "__main__":
unittest.main()
Loading
Loading