sgl-project · Kangyan-Zhou · Dec 23, 2025 · Dec 21, 2025 · Dec 21, 2025 · Dec 21, 2025
@@ -522,6 +522,43 @@ jobs:
           cd test/
           python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu
 
+  stage-c-test-large-4-gpu:  # Stage C job: runs the run_suite.py suite of the same name after the stage-b jobs listed in needs
+    needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, sgl-kernel-build-wheels]
+    if: |  # run when this stage is explicitly targeted, or (no target) on schedule / no upstream failure or cancel, and main_package or sgl_kernel changed
+      always() &&
+      (
+        (inputs.target_stage == 'stage-c-test-large-4-gpu') ||
+        (
+          !inputs.target_stage &&
+          (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+        )
+      )
+    runs-on: 4-gpu-h100
+    env:
+      RUNNER_LABELS: 4-gpu-h100  # same value as runs-on above; keep the two in sync
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Download artifacts
+        if: needs.check-changes.outputs.sgl_kernel == 'true'  # skip the download unless check-changes reported sgl_kernel changes
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true
+          pattern: wheel-python3.10-cuda12.9  # only artifacts matching this name are fetched into sgl-kernel/dist/
+
+      - name: Install dependencies
+        run: |
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 30  # limit applies to this step only, not the whole job
+        run: |
+          cd test/
+          python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu
+
   multimodal-gen-test-1-gpu:
     needs: [check-changes, call-gate, sgl-kernel-build-wheels]
     if: |
@@ -1402,6 +1439,7 @@ jobs:
         stage-b-test-small-1-gpu,
         stage-b-test-large-1-gpu,
         stage-b-test-large-2-gpu,
+        stage-c-test-large-4-gpu,
         quantization-test,
         unit-test-backend-1-gpu,
         unit-test-backend-2-gpu,

diff --git a/scripts/ci/slash_command_handler.py b/scripts/ci/slash_command_handler.py
@@ -147,6 +147,7 @@ def handle_rerun_stage(
         "stage-b-test-small-1-gpu",
         "stage-b-test-large-1-gpu",
         "stage-b-test-large-2-gpu",
+        "stage-c-test-large-4-gpu",
         "multimodal-gen-test-1-gpu",
         "multimodal-gen-test-2-gpu",
         "quantization-test",

diff --git a/test/srt/test_create_kvindices.py → ...stered/attention/test_create_kvindices.py b/test/srt/test_create_kvindices.py → ...stered/attention/test_create_kvindices.py
@@ -4,8 +4,12 @@
 import torch
 
 from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.test_utils import CustomTestCase
 
+# Triton kernel unit test for KV indices creation
+register_cuda_ci(est_time=10, suite="stage-b-test-small-1-gpu")
+
 
 class TestCreateKvIndices(CustomTestCase):
     @classmethod

diff --git a/test/srt/test_fa3.py → test/registered/attention/test_fa3.py b/test/srt/test_fa3.py → test/registered/attention/test_fa3.py
@@ -5,6 +5,7 @@
 
 from sglang.srt.environ import envs
 from sglang.srt.utils import get_device_sm, kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
@@ -17,6 +18,10 @@
     popen_launch_server,
 )
 
+# FlashAttention3 integration tests (requires SM 90+ / H100)
+# Multiple test classes: FA3, FA3+MLA, FA3+SpecDecode variants
+register_cuda_ci(est_time=300, suite="stage-b-test-large-1-gpu")
+
 GSM_DATASET_PATH = None
 
 # In case of some machine lack internet connection, we can set OFFLINE_MODE to True.

diff --git a/test/srt/test_flash_attention_4.py → ...tered/attention/test_flash_attention_4.py b/test/srt/test_flash_attention_4.py → ...tered/attention/test_flash_attention_4.py
@@ -3,13 +3,17 @@
 from urllib.parse import urlparse
 
 from sglang.srt.utils import get_device_sm, kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     popen_launch_server,
 )
 
+# FlashAttention4 integration test (requires SM 100+ / Blackwell B200, so it is registered on the B200 suite; the class below is skipped on lower SM)
+register_cuda_ci(est_time=200, suite="stage-b-test-4-gpu-b200")
+
 
 @unittest.skipIf(get_device_sm() < 100, "Test requires CUDA SM 100 or higher")
 class TestFlashAttention4(unittest.TestCase):

diff --git a/test/srt/test_hybrid_attn_backend.py → ...red/attention/test_hybrid_attn_backend.py b/test/srt/test_hybrid_attn_backend.py → ...red/attention/test_hybrid_attn_backend.py
@@ -5,6 +5,7 @@
 
 from sglang.srt.environ import envs
 from sglang.srt.utils import get_device_sm, kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.test_utils import (
     DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
@@ -17,6 +18,10 @@
     popen_launch_server,
 )
 
+# Hybrid attention backend tests (FA3 prefill + FlashInfer decode, requires SM 90+ / H100)
+# Multiple test classes: base, MLA, TorchCompile, SpecDecode variants
+register_cuda_ci(est_time=200, suite="stage-b-test-large-1-gpu")
+
 GSM_DATASET_PATH = None
 
 # Default server arguments shared across all tests

diff --git a/test/srt/test_local_attn.py → test/registered/attention/test_local_attn.py b/test/srt/test_local_attn.py → test/registered/attention/test_local_attn.py
@@ -5,6 +5,7 @@
 import requests
 
 from sglang.srt.utils import get_device_sm, kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION,
@@ -14,6 +15,9 @@
     popen_launch_server,
 )
 
+# Local attention with FA3 (requires SM 90+ / H100, tp=4)
+register_cuda_ci(est_time=200, suite="stage-c-test-large-4-gpu")
+
 
 @unittest.skipIf(get_device_sm() < 90, "Test requires CUDA SM 90 or higher")
 class TestFlashAttention3LocalAttn(CustomTestCase):

diff --git a/test/srt/test_radix_attention.py → ...istered/attention/test_radix_attention.py b/test/srt/test_radix_attention.py → ...istered/attention/test_radix_attention.py
@@ -1,6 +1,7 @@
 import unittest
 
 from sglang.srt.environ import envs
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.kits.radix_cache_server_kit import run_radix_attention_test
 from sglang.test.test_utils import (
     DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
@@ -12,6 +13,9 @@
     popen_launch_server,
 )
 
+# RadixAttention server integration tests
+register_cuda_ci(est_time=100, suite="stage-b-test-small-1-gpu")
+
 
 class TestRadixCacheFCFS(CustomTestCase):
     @classmethod

diff --git a/test/srt/test_radix_cache_unit.py → ...stered/attention/test_radix_cache_unit.py b/test/srt/test_radix_cache_unit.py → ...stered/attention/test_radix_cache_unit.py
@@ -17,6 +17,11 @@
     python -m pytest test_radix_cache_unit.py::TestRadixCache::test_insert_basic
 """
 
+from sglang.test.ci.ci_register import register_cuda_ci
+
+# CPU-based unit test, runs quickly on any GPU runner
+register_cuda_ci(est_time=5, suite="stage-b-test-small-1-gpu")
+
 import time
 import unittest
 import unittest.mock

diff --git a/...rt/test_torch_native_attention_backend.py → ...on/test_torch_native_attention_backend.py b/...rt/test_torch_native_attention_backend.py → ...on/test_torch_native_attention_backend.py
@@ -1,12 +1,13 @@
 """
 Usage:
-python3 -m unittest test_triton_attention_backend.TestTritonAttnBackend.test_mmlu
+python3 -m unittest test_torch_native_attention_backend.TestTorchNativeAttnBackend.test_mmlu
 """
 
 import unittest
 from types import SimpleNamespace
 
 from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
@@ -16,6 +17,9 @@
     popen_launch_server,
 )
 
+# Torch native attention backend integration test with MMLU eval
+register_cuda_ci(est_time=150, suite="stage-b-test-small-1-gpu")
+
 
 class TestTorchNativeAttnBackend(CustomTestCase):
     def test_mmlu(self):

diff --git a/test/srt/test_triton_attention_backend.py → ...ttention/test_triton_attention_backend.py b/test/srt/test_triton_attention_backend.py → ...ttention/test_triton_attention_backend.py
@@ -7,6 +7,7 @@
 from types import SimpleNamespace
 
 from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
@@ -18,6 +19,9 @@
     run_bench_offline_throughput,
 )
 
+# Triton attention backend integration test with latency benchmark and MMLU eval
+register_cuda_ci(est_time=200, suite="stage-b-test-small-1-gpu")
+
 
 class TestTritonAttnBackend(CustomTestCase):
     def test_latency(self):

diff --git a/test/srt/test_triton_attention_kernels.py → ...ttention/test_triton_attention_kernels.py b/test/srt/test_triton_attention_kernels.py → ...ttention/test_triton_attention_kernels.py
@@ -19,8 +19,12 @@
     context_attention_fwd,
 )
 from sglang.srt.utils import get_device
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.test_utils import CustomTestCase
 
+# Triton attention kernel unit tests (decode, extend, prefill)
+register_cuda_ci(est_time=30, suite="stage-b-test-small-1-gpu")
+
 
 def extend_attention_fwd_torch(
     q: torch.Tensor,  # [extend_tokens, H_Q, D]

diff --git a/test/srt/test_triton_sliding_window.py → ...d/attention/test_triton_sliding_window.py b/test/srt/test_triton_sliding_window.py → ...d/attention/test_triton_sliding_window.py
@@ -4,6 +4,7 @@
 import requests
 
 from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -13,6 +14,9 @@
     popen_launch_server,
 )
 
+# Sliding window attention with Triton backend (Gemma-3 model)
+register_cuda_ci(est_time=100, suite="stage-b-test-small-1-gpu")
+
 
 class TestSlidingWindowAttentionTriton(CustomTestCase):
     """Test sliding window attention functionality with triton backend."""

diff --git a/test/srt/test_wave_attention_kernels.py → .../attention/test_wave_attention_kernels.py b/test/srt/test_wave_attention_kernels.py → .../attention/test_wave_attention_kernels.py
@@ -21,6 +21,10 @@
 from sglang.srt.layers.attention.wave_ops.prefill_attention import (
     prefill_attention_wave,
 )
+from sglang.test.ci.ci_register import register_amd_ci
+
+# Wave attention kernel unit tests (AMD only - requires wave_lang)
+register_amd_ci(est_time=60, suite="stage-a-test-1")
 
 
 class TestWaveAttention(unittest.TestCase):

diff --git a/test/registered/spec/eagle/test_eagle_infer_a.py b/test/registered/spec/eagle/test_eagle_infer_a.py
@@ -22,7 +22,7 @@
     popen_launch_server,
 )
 
-register_cuda_ci(est_time=470, suite="stage-b-test-small-1-gpu")
+register_cuda_ci(est_time=500, suite="stage-b-test-small-1-gpu")
 
 torch_dtype = torch.float16
 prefill_tolerance = 5e-2

diff --git a/test/registered/spec/eagle/test_eagle_infer_b.py b/test/registered/spec/eagle/test_eagle_infer_b.py
@@ -20,7 +20,7 @@
     run_logprob_check,
 )
 
-register_cuda_ci(est_time=473, suite="stage-b-test-small-1-gpu")
+register_cuda_ci(est_time=1100, suite="stage-b-test-small-1-gpu")
 
 
 class TestEAGLEServerBasic(EagleServerBase):

diff --git a/test/run_suite.py b/test/run_suite.py
@@ -24,6 +24,7 @@
         "stage-b-test-small-1-gpu",
         "stage-b-test-large-1-gpu",
         "stage-b-test-large-2-gpu",
+        "stage-c-test-large-4-gpu",
         "stage-b-test-4-gpu-b200",
     ],
     HWBackend.NPU: [],

diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
@@ -54,20 +54,17 @@
         TestFile("dllm/test_llada2_mini.py", 520),
         TestFile("test_abort.py", 131),
         TestFile("test_chunked_prefill.py", 312),
-        TestFile("test_create_kvindices.py", 7),
         TestFile("test_deterministic.py", 228),
         TestFile("test_constrained_decoding.py", 111),
         TestFile("test_eval_fp8_accuracy.py", 250),
         TestFile("test_external_models.py", 30),
-        TestFile("test_fa3.py", 420),
         TestFile("test_flashmla.py", 230),
         TestFile("test_fp8_utils.py", 9),
         TestFile("rotary_embedding/test_mrope.py", 10),
         TestFile("test_fused_moe.py", 80),
         TestFile("test_gpt_oss_1gpu.py", 402),
         TestFile("test_harmony_parser.py", 6),
         TestFile("test_hidden_states.py", 55),
-        TestFile("test_hybrid_attn_backend.py", 379),
         TestFile("test_input_embeddings.py", 38),
         TestFile("test_io_struct.py", 8),
         TestFile("test_jinja_template_utils.py", 7),
@@ -90,8 +87,6 @@
         TestFile("test_penalty.py", 82),
         TestFile("test_priority_scheduling.py", 130),
         TestFile("test_pytorch_sampling_backend.py", 66),
-        TestFile("test_radix_attention.py", 105),
-        TestFile("test_radix_cache_unit.py", 8),
         TestFile("test_reasoning_parser.py", 5),
         TestFile("test_request_queue_validation.py", 47),
         TestFile("test_retract_decode.py", 259),
@@ -109,13 +104,8 @@
         TestFile("test_torch_compile.py", 190),
         TestFile("test_torch_compile_moe.py", 210),
         TestFile("test_triton_fused_moe.py", 12),
-        TestFile("test_torch_native_attention_backend.py", 221),
         TestFile("test_torchao.py", 103),
-        TestFile("test_triton_attention_kernels.py", 4),
-        TestFile("test_triton_attention_backend.py", 203),
-        TestFile("test_triton_attention_kernels.py", 4),
         TestFile("test_triton_moe_channel_fp8_kernel.py", 16),
-        TestFile("test_triton_sliding_window.py", 84),
         TestFile("test_utils_update_weights.py", 29),
         TestFile("test_video_utils.py", 5),
         TestFile("test_vision_chunked_prefill.py", 150),
@@ -143,7 +133,6 @@
     "per-commit-4-gpu": [
         TestFile("models/test_qwen3_next_models.py", 650),
         TestFile("test_gpt_oss_4gpu.py", 300),
-        TestFile("test_local_attn.py", 411),
         TestFile("test_multi_instance_release_memory_occupation.py", 64),
         TestFile("test_pp_single_node.py", 500),
         TestFile("test_epd_disaggregation.py", 150),
@@ -165,7 +154,6 @@
     ],
     "per-commit-4-gpu-b200": [
         TestFile("test_deepseek_v3_fp4_4gpu.py", 1500),
-        TestFile("test_flash_attention_4.py", 90),
         TestFile("test_fp8_blockwise_gemm.py", 280),
         TestFile("test_gpt_oss_4gpu.py", 700),
         TestFile("test_llama31_fp4.py", 90),
@@ -251,7 +239,6 @@
         TestFile("test_abort.py", 51),
         TestFile("test_bench_typebaseddispatcher.py", 10),
         TestFile("test_chunked_prefill.py", 312),
-        TestFile("test_create_kvindices.py", 2),
         TestFile("test_eval_fp8_accuracy.py", 303),
         TestFile("test_fused_moe.py", 30),
         TestFile("test_harmony_parser.py", 20),
@@ -266,7 +253,6 @@
         TestFile("test_page_size.py", 60),
         TestFile("test_penalty.py", 180),
         TestFile("test_pytorch_sampling_backend.py", 66),
-        TestFile("test_radix_attention.py", 105),
         TestFile("test_reasoning_parser.py", 5),
         TestFile("test_constrained_decoding.py", 120),
         TestFile("test_retract_decode.py", 450),
@@ -277,12 +263,7 @@
         TestFile("test_srt_engine.py", 261),
         TestFile("test_torch_compile.py", 169),
         # TestFile("test_torch_compile_moe.py", 210), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/13107
-        TestFile("test_torch_native_attention_backend.py", 123),
-        # TestFile("test_triton_attention_kernels.py", 4),
-        TestFile("test_triton_attention_backend.py", 150),
-        TestFile("test_triton_sliding_window.py", 250),
         TestFile("test_type_based_dispatcher.py", 10),
-        TestFile("test_wave_attention_kernels.py", 2),
         # Disabled temporarily
         # TestFile("test_vlm_input_format.py", 300),
         # TestFile("models/test_embedding_models.py", 73), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/11127