diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 0c88a5f12a98..4fc724e3cbbd 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -522,6 +522,43 @@ jobs: cd test/ python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu + stage-c-test-large-4-gpu: + needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, sgl-kernel-build-wheels] + if: | + always() && + ( + (inputs.target_stage == 'stage-c-test-large-4-gpu') || + ( + !inputs.target_stage && + (github.event_name == 'schedule' || (!failure() && !cancelled())) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + runs-on: 4-gpu-h100 + env: + RUNNER_LABELS: 4-gpu-h100 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Download artifacts + if: needs.check-changes.outputs.sgl_kernel == 'true' + uses: actions/download-artifact@v4 + with: + path: sgl-kernel/dist/ + merge-multiple: true + pattern: wheel-python3.10-cuda12.9 + + - name: Install dependencies + run: | + CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh + + - name: Run test + timeout-minutes: 30 + run: | + cd test/ + python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu + multimodal-gen-test-1-gpu: needs: [check-changes, call-gate, sgl-kernel-build-wheels] if: | @@ -1402,6 +1439,7 @@ jobs: stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, + stage-c-test-large-4-gpu, quantization-test, unit-test-backend-1-gpu, unit-test-backend-2-gpu, diff --git a/scripts/ci/slash_command_handler.py b/scripts/ci/slash_command_handler.py index 07a6ef1ef848..ceef581984cc 100644 --- a/scripts/ci/slash_command_handler.py +++ b/scripts/ci/slash_command_handler.py @@ -147,6 +147,7 @@ def handle_rerun_stage( "stage-b-test-small-1-gpu", "stage-b-test-large-1-gpu", 
"stage-b-test-large-2-gpu", + "stage-c-test-large-4-gpu", "multimodal-gen-test-1-gpu", "multimodal-gen-test-2-gpu", "quantization-test", diff --git a/test/srt/test_create_kvindices.py b/test/registered/attention/test_create_kvindices.py similarity index 93% rename from test/srt/test_create_kvindices.py rename to test/registered/attention/test_create_kvindices.py index 7e63fd823f37..0642aa29cca5 100644 --- a/test/srt/test_create_kvindices.py +++ b/test/registered/attention/test_create_kvindices.py @@ -4,8 +4,12 @@ import torch from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_utils import CustomTestCase +# Triton kernel unit test for KV indices creation +register_cuda_ci(est_time=10, suite="stage-b-test-small-1-gpu") + class TestCreateKvIndices(CustomTestCase): @classmethod diff --git a/test/srt/test_fa3.py b/test/registered/attention/test_fa3.py similarity index 97% rename from test/srt/test_fa3.py rename to test/registered/attention/test_fa3.py index 739b143fa733..24c5bb4c0779 100644 --- a/test/srt/test_fa3.py +++ b/test/registered/attention/test_fa3.py @@ -5,6 +5,7 @@ from sglang.srt.environ import envs from sglang.srt.utils import get_device_sm, kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -17,6 +18,10 @@ popen_launch_server, ) +# FlashAttention3 integration tests (requires SM 90+ / H100) +# Multiple test classes: FA3, FA3+MLA, FA3+SpecDecode variants +register_cuda_ci(est_time=300, suite="stage-b-test-large-1-gpu") + GSM_DATASET_PATH = None # In case of some machine lack internet connection, we can set OFFLINE_MODE to True. 
diff --git a/test/srt/test_flash_attention_4.py b/test/registered/attention/test_flash_attention_4.py similarity index 89% rename from test/srt/test_flash_attention_4.py rename to test/registered/attention/test_flash_attention_4.py index 9d81ccd85b49..3da4a8e32e58 100644 --- a/test/srt/test_flash_attention_4.py +++ b/test/registered/attention/test_flash_attention_4.py @@ -3,6 +3,7 @@ from urllib.parse import urlparse from sglang.srt.utils import get_device_sm, kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -10,6 +11,9 @@ popen_launch_server, ) +# FlashAttention4 integration test (requires SM 100+ / Blackwell B200; NOTE(review): confirm suite "stage-b-test-large-1-gpu" runs on SM 100 hardware — this test was removed from per-commit-4-gpu-b200 below, and on an H100 runner the skipIf makes it silently skip) +register_cuda_ci(est_time=200, suite="stage-b-test-large-1-gpu") + @unittest.skipIf(get_device_sm() < 100, "Test requires CUDA SM 100 or higher") class TestFlashAttention4(unittest.TestCase): diff --git a/test/srt/test_hybrid_attn_backend.py b/test/registered/attention/test_hybrid_attn_backend.py similarity index 94% rename from test/srt/test_hybrid_attn_backend.py rename to test/registered/attention/test_hybrid_attn_backend.py index a16203ae73e8..42f2b5946cd8 100644 --- a/test/srt/test_hybrid_attn_backend.py +++ b/test/registered/attention/test_hybrid_attn_backend.py @@ -5,6 +5,7 @@ from sglang.srt.environ import envs from sglang.srt.utils import get_device_sm, kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k from sglang.test.test_utils import ( DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST, @@ -17,6 +18,10 @@ popen_launch_server, ) +# Hybrid attention backend tests (FA3 prefill + FlashInfer decode, requires SM 90+ / H100) +# Multiple test classes: base, MLA, TorchCompile, SpecDecode variants +register_cuda_ci(est_time=200, suite="stage-b-test-large-1-gpu") + GSM_DATASET_PATH = None # 
Default server arguments shared across all tests diff --git a/test/srt/test_local_attn.py b/test/registered/attention/test_local_attn.py similarity index 91% rename from test/srt/test_local_attn.py rename to test/registered/attention/test_local_attn.py index 923ffdd5e0d8..25dc0959d27f 100644 --- a/test/srt/test_local_attn.py +++ b/test/registered/attention/test_local_attn.py @@ -5,6 +5,7 @@ import requests from sglang.srt.utils import get_device_sm, kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION, @@ -14,6 +15,9 @@ popen_launch_server, ) +# Local attention with FA3 (requires SM 90+ / H100, tp=4) +register_cuda_ci(est_time=200, suite="stage-c-test-large-4-gpu") + @unittest.skipIf(get_device_sm() < 90, "Test requires CUDA SM 90 or higher") class TestFlashAttention3LocalAttn(CustomTestCase): diff --git a/test/srt/test_radix_attention.py b/test/registered/attention/test_radix_attention.py similarity index 93% rename from test/srt/test_radix_attention.py rename to test/registered/attention/test_radix_attention.py index 2f501e882c87..c173d75bdb09 100644 --- a/test/srt/test_radix_attention.py +++ b/test/registered/attention/test_radix_attention.py @@ -1,6 +1,7 @@ import unittest from sglang.srt.environ import envs +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.kits.radix_cache_server_kit import run_radix_attention_test from sglang.test.test_utils import ( DEFAULT_SMALL_MODEL_NAME_FOR_TEST, @@ -12,6 +13,9 @@ popen_launch_server, ) +# RadixAttention server integration tests +register_cuda_ci(est_time=100, suite="stage-b-test-small-1-gpu") + class TestRadixCacheFCFS(CustomTestCase): @classmethod diff --git a/test/srt/test_radix_cache_unit.py b/test/registered/attention/test_radix_cache_unit.py similarity index 99% rename from test/srt/test_radix_cache_unit.py rename to 
test/registered/attention/test_radix_cache_unit.py index b41e2e8823de..cb30a1e507f5 100644 --- a/test/srt/test_radix_cache_unit.py +++ b/test/registered/attention/test_radix_cache_unit.py @@ -17,6 +17,11 @@ python -m pytest test_radix_cache_unit.py::TestRadixCache::test_insert_basic """ +from sglang.test.ci.ci_register import register_cuda_ci + +# CPU-based unit test, runs quickly on any GPU runner +register_cuda_ci(est_time=5, suite="stage-b-test-small-1-gpu") + import time import unittest import unittest.mock diff --git a/test/srt/test_torch_native_attention_backend.py b/test/registered/attention/test_torch_native_attention_backend.py similarity index 80% rename from test/srt/test_torch_native_attention_backend.py rename to test/registered/attention/test_torch_native_attention_backend.py index 5bf012cca7d1..e6c6a95468ac 100644 --- a/test/srt/test_torch_native_attention_backend.py +++ b/test/registered/attention/test_torch_native_attention_backend.py @@ -1,12 +1,13 @@ """ Usage: -python3 -m unittest test_triton_attention_backend.TestTritonAttnBackend.test_mmlu +python3 -m unittest test_torch_native_attention_backend.TestTorchNativeAttnBackend.test_mmlu """ import unittest from types import SimpleNamespace from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -16,6 +17,9 @@ popen_launch_server, ) +# Torch native attention backend integration test with MMLU eval +register_cuda_ci(est_time=150, suite="stage-b-test-small-1-gpu") + class TestTorchNativeAttnBackend(CustomTestCase): def test_mmlu(self): diff --git a/test/srt/test_triton_attention_backend.py b/test/registered/attention/test_triton_attention_backend.py similarity index 89% rename from test/srt/test_triton_attention_backend.py rename to test/registered/attention/test_triton_attention_backend.py index 05725301ad4a..d19bb61286be 100644 --- 
a/test/srt/test_triton_attention_backend.py +++ b/test/registered/attention/test_triton_attention_backend.py @@ -7,6 +7,7 @@ from types import SimpleNamespace from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -18,6 +19,9 @@ run_bench_offline_throughput, ) +# Triton attention backend integration test with latency benchmark and MMLU eval +register_cuda_ci(est_time=200, suite="stage-b-test-small-1-gpu") + class TestTritonAttnBackend(CustomTestCase): def test_latency(self): diff --git a/test/srt/test_triton_attention_kernels.py b/test/registered/attention/test_triton_attention_kernels.py similarity index 99% rename from test/srt/test_triton_attention_kernels.py rename to test/registered/attention/test_triton_attention_kernels.py index 5a7fb5472971..0690011bd34b 100644 --- a/test/srt/test_triton_attention_kernels.py +++ b/test/registered/attention/test_triton_attention_kernels.py @@ -19,8 +19,12 @@ context_attention_fwd, ) from sglang.srt.utils import get_device +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_utils import CustomTestCase +# Triton attention kernel unit tests (decode, extend, prefill) +register_cuda_ci(est_time=30, suite="stage-b-test-small-1-gpu") + def extend_attention_fwd_torch( q: torch.Tensor, # [extend_tokens, H_Q, D] diff --git a/test/srt/test_triton_sliding_window.py b/test/registered/attention/test_triton_sliding_window.py similarity index 95% rename from test/srt/test_triton_sliding_window.py rename to test/registered/attention/test_triton_sliding_window.py index f2b422c5ee18..9c43b0cd5267 100644 --- a/test/srt/test_triton_sliding_window.py +++ b/test/registered/attention/test_triton_sliding_window.py @@ -4,6 +4,7 @@ import requests from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci from 
sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -13,6 +14,9 @@ popen_launch_server, ) +# Sliding window attention with Triton backend (Gemma-3 model) +register_cuda_ci(est_time=100, suite="stage-b-test-small-1-gpu") + class TestSlidingWindowAttentionTriton(CustomTestCase): """Test sliding window attention functionality with triton backend.""" diff --git a/test/srt/test_wave_attention_kernels.py b/test/registered/attention/test_wave_attention_kernels.py similarity index 98% rename from test/srt/test_wave_attention_kernels.py rename to test/registered/attention/test_wave_attention_kernels.py index d4c2ff8e5a55..9dc22651e839 100644 --- a/test/srt/test_wave_attention_kernels.py +++ b/test/registered/attention/test_wave_attention_kernels.py @@ -21,6 +21,10 @@ from sglang.srt.layers.attention.wave_ops.prefill_attention import ( prefill_attention_wave, ) +from sglang.test.ci.ci_register import register_amd_ci + +# Wave attention kernel unit tests (AMD only - requires wave_lang) +register_amd_ci(est_time=60, suite="stage-a-test-1") class TestWaveAttention(unittest.TestCase): diff --git a/test/registered/spec/eagle/test_eagle_infer_a.py b/test/registered/spec/eagle/test_eagle_infer_a.py index e70a92ece652..270dbbbd9490 100644 --- a/test/registered/spec/eagle/test_eagle_infer_a.py +++ b/test/registered/spec/eagle/test_eagle_infer_a.py @@ -22,7 +22,7 @@ popen_launch_server, ) -register_cuda_ci(est_time=470, suite="stage-b-test-small-1-gpu") +register_cuda_ci(est_time=500, suite="stage-b-test-small-1-gpu") torch_dtype = torch.float16 prefill_tolerance = 5e-2 diff --git a/test/registered/spec/eagle/test_eagle_infer_b.py b/test/registered/spec/eagle/test_eagle_infer_b.py index 10c4f677418a..e4d3ebf407ce 100644 --- a/test/registered/spec/eagle/test_eagle_infer_b.py +++ b/test/registered/spec/eagle/test_eagle_infer_b.py @@ -20,7 +20,7 @@ run_logprob_check, ) -register_cuda_ci(est_time=473, 
suite="stage-b-test-small-1-gpu") +register_cuda_ci(est_time=1100, suite="stage-b-test-small-1-gpu") class TestEAGLEServerBasic(EagleServerBase): diff --git a/test/run_suite.py b/test/run_suite.py index 87ed5aed9076..97a8958e6d89 100644 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -24,6 +24,7 @@ "stage-b-test-small-1-gpu", "stage-b-test-large-1-gpu", "stage-b-test-large-2-gpu", + "stage-c-test-large-4-gpu", "stage-b-test-4-gpu-b200", ], HWBackend.NPU: [], diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 34be3572aac8..7785898130e8 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -54,12 +54,10 @@ TestFile("dllm/test_llada2_mini.py", 520), TestFile("test_abort.py", 131), TestFile("test_chunked_prefill.py", 312), - TestFile("test_create_kvindices.py", 7), TestFile("test_deterministic.py", 228), TestFile("test_constrained_decoding.py", 111), TestFile("test_eval_fp8_accuracy.py", 250), TestFile("test_external_models.py", 30), - TestFile("test_fa3.py", 420), TestFile("test_flashmla.py", 230), TestFile("test_fp8_utils.py", 9), TestFile("rotary_embedding/test_mrope.py", 10), @@ -67,7 +65,6 @@ TestFile("test_gpt_oss_1gpu.py", 402), TestFile("test_harmony_parser.py", 6), TestFile("test_hidden_states.py", 55), - TestFile("test_hybrid_attn_backend.py", 379), TestFile("test_input_embeddings.py", 38), TestFile("test_io_struct.py", 8), TestFile("test_jinja_template_utils.py", 7), @@ -90,8 +87,6 @@ TestFile("test_penalty.py", 82), TestFile("test_priority_scheduling.py", 130), TestFile("test_pytorch_sampling_backend.py", 66), - TestFile("test_radix_attention.py", 105), - TestFile("test_radix_cache_unit.py", 8), TestFile("test_reasoning_parser.py", 5), TestFile("test_request_queue_validation.py", 47), TestFile("test_retract_decode.py", 259), @@ -109,13 +104,8 @@ TestFile("test_torch_compile.py", 190), TestFile("test_torch_compile_moe.py", 210), TestFile("test_triton_fused_moe.py", 12), - TestFile("test_torch_native_attention_backend.py", 221), 
TestFile("test_torchao.py", 103), - TestFile("test_triton_attention_kernels.py", 4), - TestFile("test_triton_attention_backend.py", 203), - TestFile("test_triton_attention_kernels.py", 4), TestFile("test_triton_moe_channel_fp8_kernel.py", 16), - TestFile("test_triton_sliding_window.py", 84), TestFile("test_utils_update_weights.py", 29), TestFile("test_video_utils.py", 5), TestFile("test_vision_chunked_prefill.py", 150), @@ -143,7 +133,6 @@ "per-commit-4-gpu": [ TestFile("models/test_qwen3_next_models.py", 650), TestFile("test_gpt_oss_4gpu.py", 300), - TestFile("test_local_attn.py", 411), TestFile("test_multi_instance_release_memory_occupation.py", 64), TestFile("test_pp_single_node.py", 500), TestFile("test_epd_disaggregation.py", 150), @@ -165,7 +154,6 @@ ], "per-commit-4-gpu-b200": [ TestFile("test_deepseek_v3_fp4_4gpu.py", 1500), - TestFile("test_flash_attention_4.py", 90), TestFile("test_fp8_blockwise_gemm.py", 280), TestFile("test_gpt_oss_4gpu.py", 700), TestFile("test_llama31_fp4.py", 90), @@ -251,7 +239,6 @@ TestFile("test_abort.py", 51), TestFile("test_bench_typebaseddispatcher.py", 10), TestFile("test_chunked_prefill.py", 312), - TestFile("test_create_kvindices.py", 2), TestFile("test_eval_fp8_accuracy.py", 303), TestFile("test_fused_moe.py", 30), TestFile("test_harmony_parser.py", 20), @@ -266,7 +253,6 @@ TestFile("test_page_size.py", 60), TestFile("test_penalty.py", 180), TestFile("test_pytorch_sampling_backend.py", 66), - TestFile("test_radix_attention.py", 105), TestFile("test_reasoning_parser.py", 5), TestFile("test_constrained_decoding.py", 120), TestFile("test_retract_decode.py", 450), @@ -277,12 +263,7 @@ TestFile("test_srt_engine.py", 261), TestFile("test_torch_compile.py", 169), # TestFile("test_torch_compile_moe.py", 210), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/13107 - TestFile("test_torch_native_attention_backend.py", 123), - # TestFile("test_triton_attention_kernels.py", 4), - 
TestFile("test_triton_attention_backend.py", 150), - TestFile("test_triton_sliding_window.py", 250), TestFile("test_type_based_dispatcher.py", 10), - TestFile("test_wave_attention_kernels.py", 2), # Disabled temporarily # TestFile("test_vlm_input_format.py", 300), # TestFile("models/test_embedding_models.py", 73), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/11127