Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions .github/workflows/pr-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,43 @@ jobs:
cd test/
python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu

# Stage C job: runs the `stage-c-test-large-4-gpu` test suite on the `4-gpu-h100` runner.
# It lists the stage-b jobs and the sgl-kernel wheel build in `needs`.
stage-c-test-large-4-gpu:
needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, sgl-kernel-build-wheels]
# Run when this stage is explicitly targeted via `inputs.target_stage`. Otherwise,
# when no target stage is given, run on scheduled events or when no needed job
# failed or was cancelled, and only if check-changes reports that the main
# package or sgl-kernel changed. `always()` keeps the expression evaluated even
# if a needed job did not succeed.
if: |
always() &&
(
(inputs.target_stage == 'stage-c-test-large-4-gpu') ||
(
!inputs.target_stage &&
(github.event_name == 'schedule' || (!failure() && !cancelled())) &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
)
)
runs-on: 4-gpu-h100
env:
RUNNER_LABELS: 4-gpu-h100
steps:
- name: Checkout code
uses: actions/checkout@v4

# Download the wheel artifact matching `wheel-python3.10-cuda12.9` into
# sgl-kernel/dist/ only when sgl-kernel changed (presumably produced by the
# sgl-kernel-build-wheels job - confirm against that job's upload step).
- name: Download artifacts
if: needs.check-changes.outputs.sgl_kernel == 'true'
uses: actions/download-artifact@v4
with:
path: sgl-kernel/dist/
merge-multiple: true
pattern: wheel-python3.10-cuda12.9

# The sgl_kernel change flag is forwarded to the install script as CUSTOM_BUILD_SGL_KERNEL.
- name: Install dependencies
run: |
CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

- name: Run test
timeout-minutes: 30
run: |
cd test/
python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu

multimodal-gen-test-1-gpu:
needs: [check-changes, call-gate, sgl-kernel-build-wheels]
if: |
Expand Down Expand Up @@ -1402,6 +1439,7 @@ jobs:
stage-b-test-small-1-gpu,
stage-b-test-large-1-gpu,
stage-b-test-large-2-gpu,
stage-c-test-large-4-gpu,
quantization-test,
unit-test-backend-1-gpu,
unit-test-backend-2-gpu,
Expand Down
1 change: 1 addition & 0 deletions scripts/ci/slash_command_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ def handle_rerun_stage(
"stage-b-test-small-1-gpu",
"stage-b-test-large-1-gpu",
"stage-b-test-large-2-gpu",
"stage-c-test-large-4-gpu",
"multimodal-gen-test-1-gpu",
"multimodal-gen-test-2-gpu",
"quantization-test",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@
import torch

from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.test_utils import CustomTestCase

# Triton kernel unit test for KV indices creation
register_cuda_ci(est_time=10, suite="stage-b-test-small-1-gpu")


class TestCreateKvIndices(CustomTestCase):
@classmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from sglang.srt.environ import envs
from sglang.srt.utils import get_device_sm, kill_process_tree
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
Expand All @@ -17,6 +18,10 @@
popen_launch_server,
)

# FlashAttention3 integration tests (requires SM 90+ / H100)
# Multiple test classes: FA3, FA3+MLA, FA3+SpecDecode variants
register_cuda_ci(est_time=300, suite="stage-b-test-large-1-gpu")

GSM_DATASET_PATH = None

# In case some machines lack an internet connection, we can set OFFLINE_MODE to True.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@
from urllib.parse import urlparse

from sglang.srt.utils import get_device_sm, kill_process_tree
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
popen_launch_server,
)

# FlashAttention4 integration test (requires SM 100+ / Blackwell B200).
# Registered on the B200 suite because the test class below is skipped whenever
# get_device_sm() < 100; on a pre-Blackwell runner the file would pass without
# executing any test, silently dropping FA4 coverage.
# NOTE(review): assumes "stage-b-test-4-gpu-b200" is scheduled on B200 runners
# - confirm against the workflow definition.
register_cuda_ci(est_time=200, suite="stage-b-test-4-gpu-b200")


@unittest.skipIf(get_device_sm() < 100, "Test requires CUDA SM 100 or higher")
class TestFlashAttention4(unittest.TestCase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from sglang.srt.environ import envs
from sglang.srt.utils import get_device_sm, kill_process_tree
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.test_utils import (
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
Expand All @@ -17,6 +18,10 @@
popen_launch_server,
)

# Hybrid attention backend tests (FA3 prefill + FlashInfer decode, requires SM 90+ / H100)
# Multiple test classes: base, MLA, TorchCompile, SpecDecode variants
register_cuda_ci(est_time=200, suite="stage-b-test-large-1-gpu")

GSM_DATASET_PATH = None

# Default server arguments shared across all tests
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import requests

from sglang.srt.utils import get_device_sm, kill_process_tree
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION,
Expand All @@ -14,6 +15,9 @@
popen_launch_server,
)

# Local attention with FA3 (requires SM 90+ / H100, tp=4)
register_cuda_ci(est_time=200, suite="stage-c-test-large-4-gpu")


@unittest.skipIf(get_device_sm() < 90, "Test requires CUDA SM 90 or higher")
class TestFlashAttention3LocalAttn(CustomTestCase):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import unittest

from sglang.srt.environ import envs
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.kits.radix_cache_server_kit import run_radix_attention_test
from sglang.test.test_utils import (
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
Expand All @@ -12,6 +13,9 @@
popen_launch_server,
)

# RadixAttention server integration tests
register_cuda_ci(est_time=100, suite="stage-b-test-small-1-gpu")


class TestRadixCacheFCFS(CustomTestCase):
@classmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
python -m pytest test_radix_cache_unit.py::TestRadixCache::test_insert_basic
"""

from sglang.test.ci.ci_register import register_cuda_ci

# CPU-based unit test, runs quickly on any GPU runner
register_cuda_ci(est_time=5, suite="stage-b-test-small-1-gpu")

import time
import unittest
import unittest.mock
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
"""
Usage:
python3 -m unittest test_triton_attention_backend.TestTritonAttnBackend.test_mmlu
python3 -m unittest test_torch_native_attention_backend.TestTorchNativeAttnBackend.test_mmlu
"""

import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
Expand All @@ -16,6 +17,9 @@
popen_launch_server,
)

# Torch native attention backend integration test with MMLU eval
register_cuda_ci(est_time=150, suite="stage-b-test-small-1-gpu")


class TestTorchNativeAttnBackend(CustomTestCase):
def test_mmlu(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
Expand All @@ -18,6 +19,9 @@
run_bench_offline_throughput,
)

# Triton attention backend integration test with latency benchmark and MMLU eval
register_cuda_ci(est_time=200, suite="stage-b-test-small-1-gpu")


class TestTritonAttnBackend(CustomTestCase):
def test_latency(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@
context_attention_fwd,
)
from sglang.srt.utils import get_device
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.test_utils import CustomTestCase

# Triton attention kernel unit tests (decode, extend, prefill)
register_cuda_ci(est_time=30, suite="stage-b-test-small-1-gpu")


def extend_attention_fwd_torch(
q: torch.Tensor, # [extend_tokens, H_Q, D]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import requests

from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
Expand All @@ -13,6 +14,9 @@
popen_launch_server,
)

# Sliding window attention with Triton backend (Gemma-3 model)
register_cuda_ci(est_time=100, suite="stage-b-test-small-1-gpu")


class TestSlidingWindowAttentionTriton(CustomTestCase):
"""Test sliding window attention functionality with triton backend."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
from sglang.srt.layers.attention.wave_ops.prefill_attention import (
prefill_attention_wave,
)
from sglang.test.ci.ci_register import register_amd_ci

# Wave attention kernel unit tests (AMD only - requires wave_lang)
register_amd_ci(est_time=60, suite="stage-a-test-1")


class TestWaveAttention(unittest.TestCase):
Expand Down
2 changes: 1 addition & 1 deletion test/registered/spec/eagle/test_eagle_infer_a.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
popen_launch_server,
)

register_cuda_ci(est_time=470, suite="stage-b-test-small-1-gpu")
register_cuda_ci(est_time=500, suite="stage-b-test-small-1-gpu")

torch_dtype = torch.float16
prefill_tolerance = 5e-2
Expand Down
2 changes: 1 addition & 1 deletion test/registered/spec/eagle/test_eagle_infer_b.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
run_logprob_check,
)

register_cuda_ci(est_time=473, suite="stage-b-test-small-1-gpu")
register_cuda_ci(est_time=1100, suite="stage-b-test-small-1-gpu")


class TestEAGLEServerBasic(EagleServerBase):
Expand Down
1 change: 1 addition & 0 deletions test/run_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"stage-b-test-small-1-gpu",
"stage-b-test-large-1-gpu",
"stage-b-test-large-2-gpu",
"stage-c-test-large-4-gpu",
"stage-b-test-4-gpu-b200",
],
HWBackend.NPU: [],
Expand Down
19 changes: 0 additions & 19 deletions test/srt/run_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,20 +54,17 @@
TestFile("dllm/test_llada2_mini.py", 520),
TestFile("test_abort.py", 131),
TestFile("test_chunked_prefill.py", 312),
TestFile("test_create_kvindices.py", 7),
TestFile("test_deterministic.py", 228),
TestFile("test_constrained_decoding.py", 111),
TestFile("test_eval_fp8_accuracy.py", 250),
TestFile("test_external_models.py", 30),
TestFile("test_fa3.py", 420),
TestFile("test_flashmla.py", 230),
TestFile("test_fp8_utils.py", 9),
TestFile("rotary_embedding/test_mrope.py", 10),
TestFile("test_fused_moe.py", 80),
TestFile("test_gpt_oss_1gpu.py", 402),
TestFile("test_harmony_parser.py", 6),
TestFile("test_hidden_states.py", 55),
TestFile("test_hybrid_attn_backend.py", 379),
TestFile("test_input_embeddings.py", 38),
TestFile("test_io_struct.py", 8),
TestFile("test_jinja_template_utils.py", 7),
Expand All @@ -90,8 +87,6 @@
TestFile("test_penalty.py", 82),
TestFile("test_priority_scheduling.py", 130),
TestFile("test_pytorch_sampling_backend.py", 66),
TestFile("test_radix_attention.py", 105),
TestFile("test_radix_cache_unit.py", 8),
TestFile("test_reasoning_parser.py", 5),
TestFile("test_request_queue_validation.py", 47),
TestFile("test_retract_decode.py", 259),
Expand All @@ -109,13 +104,8 @@
TestFile("test_torch_compile.py", 190),
TestFile("test_torch_compile_moe.py", 210),
TestFile("test_triton_fused_moe.py", 12),
TestFile("test_torch_native_attention_backend.py", 221),
TestFile("test_torchao.py", 103),
TestFile("test_triton_attention_kernels.py", 4),
TestFile("test_triton_attention_backend.py", 203),
TestFile("test_triton_attention_kernels.py", 4),
TestFile("test_triton_moe_channel_fp8_kernel.py", 16),
TestFile("test_triton_sliding_window.py", 84),
TestFile("test_utils_update_weights.py", 29),
TestFile("test_video_utils.py", 5),
TestFile("test_vision_chunked_prefill.py", 150),
Expand Down Expand Up @@ -143,7 +133,6 @@
"per-commit-4-gpu": [
TestFile("models/test_qwen3_next_models.py", 650),
TestFile("test_gpt_oss_4gpu.py", 300),
TestFile("test_local_attn.py", 411),
TestFile("test_multi_instance_release_memory_occupation.py", 64),
TestFile("test_pp_single_node.py", 500),
TestFile("test_epd_disaggregation.py", 150),
Expand All @@ -165,7 +154,6 @@
],
"per-commit-4-gpu-b200": [
TestFile("test_deepseek_v3_fp4_4gpu.py", 1500),
TestFile("test_flash_attention_4.py", 90),
TestFile("test_fp8_blockwise_gemm.py", 280),
TestFile("test_gpt_oss_4gpu.py", 700),
TestFile("test_llama31_fp4.py", 90),
Expand Down Expand Up @@ -251,7 +239,6 @@
TestFile("test_abort.py", 51),
TestFile("test_bench_typebaseddispatcher.py", 10),
TestFile("test_chunked_prefill.py", 312),
TestFile("test_create_kvindices.py", 2),
TestFile("test_eval_fp8_accuracy.py", 303),
TestFile("test_fused_moe.py", 30),
TestFile("test_harmony_parser.py", 20),
Expand All @@ -266,7 +253,6 @@
TestFile("test_page_size.py", 60),
TestFile("test_penalty.py", 180),
TestFile("test_pytorch_sampling_backend.py", 66),
TestFile("test_radix_attention.py", 105),
TestFile("test_reasoning_parser.py", 5),
TestFile("test_constrained_decoding.py", 120),
TestFile("test_retract_decode.py", 450),
Expand All @@ -277,12 +263,7 @@
TestFile("test_srt_engine.py", 261),
TestFile("test_torch_compile.py", 169),
# TestFile("test_torch_compile_moe.py", 210), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/13107
TestFile("test_torch_native_attention_backend.py", 123),
# TestFile("test_triton_attention_kernels.py", 4),
TestFile("test_triton_attention_backend.py", 150),
TestFile("test_triton_sliding_window.py", 250),
TestFile("test_type_based_dispatcher.py", 10),
TestFile("test_wave_attention_kernels.py", 2),
# Disabled temporarily
# TestFile("test_vlm_input_format.py", 300),
# TestFile("models/test_embedding_models.py", 73), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/11127
Expand Down
Loading