sgl-project · mingfeima · May 12, 2026 · Apr 13, 2026 · gemini-code-assist · May 9, 2026
@@ -115,7 +115,7 @@ jobs:
         timeout-minutes: 36
         run: |
           docker exec -w /sglang-checkout/ ci_sglang_xeon \
-            bash -c "source /opt/.venv/bin/activate && cd ./test/srt && python3 run_suite.py --suite per-commit-cpu --timeout-per-file 1500"
+            bash -c "source /opt/.venv/bin/activate && cd ./test && python3 run_suite.py --hw cpu --suite stage-b-test-cpu"
 
       - name: Change permission
         timeout-minutes: 2

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -92,6 +92,7 @@ repos:
         entry: python3 scripts/ci/check_registered_tests.py
         language: system
         files: ^test/registered/.*\.py$
+        exclude: ^test/registered/.*/utils\.py$
         pass_filenames: false
       - id: check-no-docs-changes
         name: reject changes under legacy docs/

@@ -83,7 +83,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
     }}
   >
     <a
-      href="https://lmsys.org/blog/2026-04-25-deepseek-v4/"
+      href="https://lmsys.org/blog/2026-04-29-p2p-update/"
       target="_blank"
       rel="noopener noreferrer"
       style={{
@@ -104,8 +104,8 @@ It is designed to deliver low-latency and high-throughput inference across a wid
         }}
       >
         <img
-          src="https://lmsys.org/images/blog/deepseek_v4/benchmark_vs_oss.png"
-          alt="DeepSeek-V4 on Day 0: From Fast Inference to Verified RL with SGLang and Miles"
+          src="https://lmsys.org/images/blog/p2p-update/p2p-overview.png"
+          alt="Updating 1T parameters in seconds \u2014 P2P weight transfer in Large Scale Distributed RL"
           style={{
             width: "100%",
             height: "100%",
@@ -124,7 +124,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
             fontSize: "0.98rem",
           }}
         >
-          {"DeepSeek-V4 on Day 0: From Fast Inference to Verified RL with SGLang and Miles"}
+          {"Updating 1T parameters in seconds \u2014 P2P weight transfer in Large Scale Distributed RL"}
         </p>
         <p
           style={{
@@ -133,12 +133,12 @@ It is designed to deliver low-latency and high-throughput inference across a wid
             opacity: 0.75,
           }}
         >
-          {"April 25, 2026"}
+          {"April 29, 2026"}
         </p>
       </div>
     </a>
     <a
-      href="https://lmsys.org/blog/2026-04-10-sglang-hisparse/"
+      href="https://lmsys.org/blog/2026-04-25-deepseek-v4/"
       target="_blank"
       rel="noopener noreferrer"
       style={{
@@ -159,8 +159,8 @@ It is designed to deliver low-latency and high-throughput inference across a wid
         }}
       >
         <img
-          src="https://lmsys.org/images/blog/hisparse/hisparse_overview.png"
-          alt="HiSparse: Turbocharging Sparse Attention with Hierarchical Memory"
+          src="https://lmsys.org/images/blog/deepseek_v4/benchmark_vs_oss.png"
+          alt="DeepSeek-V4 on Day 0: From Fast Inference to Verified RL with SGLang and Miles"
           style={{
             width: "100%",
             height: "100%",
@@ -179,7 +179,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
             fontSize: "0.98rem",
           }}
         >
-          {"HiSparse: Turbocharging Sparse Attention with Hierarchical Memory"}
+          {"DeepSeek-V4 on Day 0: From Fast Inference to Verified RL with SGLang and Miles"}
         </p>
         <p
           style={{
@@ -188,12 +188,12 @@ It is designed to deliver low-latency and high-throughput inference across a wid
             opacity: 0.75,
           }}
         >
-          {"April 10, 2026"}
+          {"April 25, 2026"}
         </p>
       </div>
     </a>
     <a
-      href="https://lmsys.org/blog/2026-03-25-gtc2026/"
+      href="https://lmsys.org/blog/2026-04-10-sglang-hisparse/"
       target="_blank"
       rel="noopener noreferrer"
       style={{
@@ -214,8 +214,8 @@ It is designed to deliver low-latency and high-throughput inference across a wid
         }}
       >
         <img
-          src="https://lmsys.org/images/blog/gtc2026/happyhour-crowd.jpg"
-          alt="Highlights of SGLang at NVIDIA GTC 2026"
+          src="https://lmsys.org/images/blog/hisparse/hisparse_overview.png"
+          alt="HiSparse: Turbocharging Sparse Attention with Hierarchical Memory"
           style={{
             width: "100%",
             height: "100%",
@@ -234,7 +234,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
             fontSize: "0.98rem",
           }}
         >
-          {"Highlights of SGLang at NVIDIA GTC 2026"}
+          {"HiSparse: Turbocharging Sparse Attention with Hierarchical Memory"}
         </p>
         <p
           style={{
@@ -243,12 +243,12 @@ It is designed to deliver low-latency and high-throughput inference across a wid
             opacity: 0.75,
           }}
         >
-          {"March 31, 2026"}
+          {"April 10, 2026"}
         </p>
       </div>
     </a>
     <a
-      href="https://lmsys.org/blog/2026-03-25-eep-partial-failure-tolerance/"
+      href="https://lmsys.org/blog/2026-03-25-gtc2026/"
       target="_blank"
       rel="noopener noreferrer"
       style={{
@@ -269,8 +269,8 @@ It is designed to deliver low-latency and high-throughput inference across a wid
         }}
       >
         <img
-          src="https://lmsys.org/images/blog/eep-partial-failure-tolerance/figure.png"
-          alt="Elastic EP in SGLang: Achieving Partial Failure Tolerance for DeepSeek MoE Deployments"
+          src="https://lmsys.org/images/blog/gtc2026/happyhour-crowd.jpg"
+          alt="Highlights of SGLang at NVIDIA GTC 2026"
           style={{
             width: "100%",
             height: "100%",
@@ -289,7 +289,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
             fontSize: "0.98rem",
           }}
         >
-          {"Elastic EP in SGLang: Achieving Partial Failure Tolerance for DeepSeek MoE Deployments"}
+          {"Highlights of SGLang at NVIDIA GTC 2026"}
         </p>
         <p
           style={{
@@ -298,12 +298,12 @@ It is designed to deliver low-latency and high-throughput inference across a wid
             opacity: 0.75,
           }}
         >
-          {"March 25, 2026"}
+          {"March 31, 2026"}
         </p>
       </div>
     </a>
     <a
-      href="https://lmsys.org/blog/2026-03-17-rocm-miles-rl-amd/"
+      href="https://lmsys.org/blog/2026-03-25-eep-partial-failure-tolerance/"
       target="_blank"
       rel="noopener noreferrer"
       style={{
@@ -324,8 +324,8 @@ It is designed to deliver low-latency and high-throughput inference across a wid
         }}
       >
         <img
-          src="https://lmsys.org/images/blog/rocm_miles_rl/fig_1.png"
-          alt="ROCm Support for Miles: Large-Scale RL Post-Training on AMD Instinct\u2122 GPUs"
+          src="https://lmsys.org/images/blog/eep-partial-failure-tolerance/figure.png"
+          alt="Elastic EP in SGLang: Achieving Partial Failure Tolerance for DeepSeek MoE Deployments"
           style={{
             width: "100%",
             height: "100%",
@@ -344,7 +344,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
             fontSize: "0.98rem",
           }}
         >
-          {"ROCm Support for Miles: Large-Scale RL Post-Training on AMD Instinct\u2122 GPUs"}
+          {"Elastic EP in SGLang: Achieving Partial Failure Tolerance for DeepSeek MoE Deployments"}
         </p>
         <p
           style={{
@@ -353,12 +353,12 @@ It is designed to deliver low-latency and high-throughput inference across a wid
             opacity: 0.75,
           }}
         >
-          {"March 17, 2026"}
+          {"March 25, 2026"}
         </p>
       </div>
     </a>
     <a
-      href="https://lmsys.org/blog/2026-03-11-run-nvidia-nemotron-3-super/"
+      href="https://lmsys.org/blog/2026-03-17-rocm-miles-rl-amd/"
       target="_blank"
       rel="noopener noreferrer"
       style={{
@@ -379,8 +379,8 @@ It is designed to deliver low-latency and high-throughput inference across a wid
         }}
       >
         <img
-          src="https://lmsys.org/images/blog/nemotron-3-super/figure_1.svg"
-          alt="SGLang Adds Day-0 Support for NVIDIA Nemotron 3 Super for building High-Efficiency Multi-Agent Systems"
+          src="https://lmsys.org/images/blog/rocm_miles_rl/fig_1.png"
+          alt="ROCm Support for Miles: Large-Scale RL Post-Training on AMD Instinct\u2122 GPUs"
           style={{
             width: "100%",
             height: "100%",
@@ -399,7 +399,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
             fontSize: "0.98rem",
           }}
         >
-          {"SGLang Adds Day-0 Support for NVIDIA Nemotron 3 Super for building High-Efficiency Multi-Agent Systems"}
+          {"ROCm Support for Miles: Large-Scale RL Post-Training on AMD Instinct\u2122 GPUs"}
         </p>
         <p
           style={{
@@ -408,7 +408,7 @@ It is designed to deliver low-latency and high-throughput inference across a wid
             opacity: 0.75,
           }}
         >
-          {"March 11, 2026"}
+          {"March 17, 2026"}
         </p>
       </div>
     </a>

diff --git a/scripts/ci/check_registered_tests.py b/scripts/ci/check_registered_tests.py
@@ -22,11 +22,11 @@ def main() -> int:
     ci_register = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(ci_register)
 
-    # Same filter as run_suite.py: skip conftest.py and __init__.py
+    # Same filter as run_suite.py: skip conftest.py, __init__.py, and utils.py
     files = sorted(
         f
         for f in glob.glob("test/registered/**/*.py", recursive=True)
-        if os.path.basename(f) not in ("conftest.py", "__init__.py")
+        if os.path.basename(f) not in ("conftest.py", "__init__.py", "utils.py")
     )
     if not files:
         return 0

diff --git a/test/registered/cpu/test_activation.py b/test/registered/cpu/test_activation.py
@@ -0,0 +1,59 @@
+import itertools
+import unittest
+
+import torch
+from utils import GeluAndMul, SiluAndMul, precision
+
+from sglang.srt.server_args import ServerArgs, set_global_server_args_for_scheduler
+from sglang.test.ci.ci_register import register_cpu_ci
+from sglang.test.test_utils import CustomTestCase
+
+# Register this file with the CPU CI registry under the "stage-b-test-cpu" suite.
+# NOTE(review): est_time is presumably the expected runtime in seconds — confirm.
+register_cpu_ci(est_time=10, suite="stage-b-test-cpu")
+
+# Fixed seed so the torch.randn inputs generated by the tests are reproducible.
+torch.manual_seed(1234)
+
+
+class TestActivation(CustomTestCase):
+    """Compare the sgl_kernel CPU activation ops with the references from ``utils``.
+
+    Covers ``silu_and_mul_cpu``, ``gelu_and_mul_cpu`` and
+    ``gelu_tanh_and_mul_cpu`` over every combination of ``M``, ``N`` and
+    ``dtype`` listed below.
+    """
+
+    # Row counts, column counts and dtypes swept by test_activation.
+    M = [128, 129, 257]
+    N = [22016, 22018]
+    dtype = [torch.float16, torch.bfloat16]
+
+    def _assert_matches_reference(self, out, ref_out):
+        """Assert that the kernel result ``out`` is close to ``ref_out``.
+
+        ``torch.testing.assert_close`` takes ``(actual, expected)``. The kernel
+        output is passed as ``actual`` and the reference as ``expected`` so the
+        relative tolerance is scaled by the reference values and failure
+        messages label the two tensors correctly.
+        """
+        # The same per-dtype value from utils.precision is used for atol and rtol.
+        atol = rtol = precision[ref_out.dtype]
+        torch.testing.assert_close(out, ref_out, atol=atol, rtol=rtol)
+
+    def _silu_and_mul_test(self, m, n, dtype):
+        """Check ``silu_and_mul_cpu`` against ``SiluAndMul`` on a random [m, n] input."""
+        # NOTE(review): presumably something on this path reads the global
+        # server args; only this helper sets them — confirm.
+        set_global_server_args_for_scheduler(ServerArgs(model_path="dummy"))
+
+        x = torch.randn([m, n], dtype=dtype)
+
+        out = torch.ops.sgl_kernel.silu_and_mul_cpu(x)
+        ref_out = SiluAndMul(x)
+
+        self._assert_matches_reference(out, ref_out)
+
+    def _gelu_and_mul_test(self, m, n, dtype):
+        """Check ``gelu_and_mul_cpu`` against ``GeluAndMul(approximate="none")``."""
+        x = torch.randn([m, n], dtype=dtype)
+
+        out = torch.ops.sgl_kernel.gelu_and_mul_cpu(x)
+        ref_out = GeluAndMul(x, approximate="none")
+
+        self._assert_matches_reference(out, ref_out)
+
+    def _gelu_tanh_and_mul_test(self, m, n, dtype):
+        """Check ``gelu_tanh_and_mul_cpu`` against ``GeluAndMul(approximate="tanh")``."""
+        x = torch.randn([m, n], dtype=dtype)
+
+        out = torch.ops.sgl_kernel.gelu_tanh_and_mul_cpu(x)
+        ref_out = GeluAndMul(x, approximate="tanh")
+
+        self._assert_matches_reference(out, ref_out)
+
+    def test_activation(self):
+        """Run all three activation checks for every (m, n, dtype) combination."""
+        for params in itertools.product(self.M, self.N, self.dtype):
+            # One subTest per combination so a failure reports its parameters
+            # and the remaining combinations still run.
+            with self.subTest(m=params[0], n=params[1], dtype=params[2]):
+                self._silu_and_mul_test(*params)
+                self._gelu_and_mul_test(*params)
+                self._gelu_tanh_and_mul_test(*params)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/registered/cpu/test_binding.py b/test/registered/cpu/test_binding.py
@@ -0,0 +1,30 @@
+import re
+import unittest
+
+import torch
+
+# Module-level alias for the sgl_kernel op namespace used by TestBinding.
+kernel = torch.ops.sgl_kernel
+
+from sglang.test.ci.ci_register import register_cpu_ci
+from sglang.test.test_utils import CustomTestCase
+
+# Register this file with the CPU CI registry under the "stage-b-test-cpu" suite.
+# NOTE(review): est_time is presumably the expected runtime in seconds — confirm.
+register_cpu_ci(est_time=10, suite="stage-b-test-cpu")
+
+
+class TestBinding(CustomTestCase):
+    """Checks the thread-to-core report returned by ``init_cpu_threads_env``."""
+
+    def test_binding(self):
+        """Request six consecutive cores starting at core 1 and verify the report."""
+        first_core, core_count = 1, 6
+
+        # Core ids are kept as strings because the regex captures are strings too.
+        wanted = [str(core) for core in range(first_core, first_core + core_count)]
+        report = kernel.init_cpu_threads_env(",".join(wanted))
+
+        # Collect the core id from each "OMP tid: <n>, core <m>" entry, in order.
+        reported = [
+            hit.group(1)
+            for hit in re.finditer(r"OMP tid: \d+, core (\d+)", report)
+        ]
+        self.assertEqual(len(reported), core_count)
+
+        self.assertEqual(reported, wanted)
+
+
+if __name__ == "__main__":
+    unittest.main()