Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions test/srt/run_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@
TestFile("test_original_logprobs.py", 41),
TestFile("test_page_size.py", 60),
TestFile("test_penalty.py", 82),
TestFile("test_piecewise_cuda_graph_1_gpu_a.py", 460),
TestFile("test_piecewise_cuda_graph_1_gpu_b.py", 480),
TestFile("test_priority_scheduling.py", 130),
TestFile("test_pytorch_sampling_backend.py", 66),
TestFile("test_radix_attention.py", 105),
Expand Down Expand Up @@ -117,7 +119,7 @@
TestFile("test_triton_sliding_window.py", 84),
TestFile("test_utils_update_weights.py", 29),
TestFile("test_video_utils.py", 5),
TestFile("test_vision_chunked_prefill.py", 117),
TestFile("test_vision_chunked_prefill.py", 150),
TestFile("test_vision_openai_server_a.py", 778),
TestFile("test_vlm_input_format.py", 166),
TestFile("test_modelopt_export.py", 9),
Expand All @@ -137,6 +139,7 @@
TestFile("test_dp_attention.py", 350),
TestFile("test_load_weights_from_remote_instance.py", 72),
TestFile("test_patch_torch.py", 19),
TestFile("test_piecewise_cuda_graph_2_gpu.py", 200),
TestFile("test_eagle_dp_attention.py", 200),
],
"per-commit-4-gpu": [
Expand All @@ -145,7 +148,6 @@
TestFile("test_local_attn.py", 411),
TestFile("test_multi_instance_release_memory_occupation.py", 64),
TestFile("test_pp_single_node.py", 500),
TestFile("test_piecewise_cuda_graph.py", 1260),
TestFile("test_epd_disaggregation.py", 150),
],
"per-commit-8-gpu-h200": [
Expand Down
175 changes: 175 additions & 0 deletions test/srt/test_piecewise_cuda_graph_1_gpu_a.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import unittest

from sglang.srt.utils import get_device_sm, kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_MODEL_NAME_FOR_TEST_MLA,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
SimpleNamespace,
popen_launch_server,
run_bench_one_batch,
)


class TestPiecewiseCudaGraphCorrectness(CustomTestCase):
    """MMLU correctness check for a server launched with piecewise CUDA graph."""

    @classmethod
    def setUpClass(cls):
        # One shared server for the whole class; killed in tearDownClass.
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        launch_flags = ["--enable-piecewise-cuda-graph"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        """Run a small MMLU eval and require a minimum score."""
        eval_args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=64,
            num_threads=32,
        )
        result = run_eval(eval_args)
        self.assertGreaterEqual(result["score"], 0.65)


class TestPiecewiseCudaGraphBenchmark(CustomTestCase):
    """Latency smoke test for piecewise CUDA graph on the default model."""

    def test_latency(self):
        """Benchmark one batch; only the prefill latency (first element) is asserted."""
        bench_result = run_bench_one_batch(
            DEFAULT_MODEL_NAME_FOR_TEST,
            other_args=["--enable-piecewise-cuda-graph"],
        )
        prefill_latency = bench_result[0]
        self.assertLess(prefill_latency, 0.015)


@unittest.skipIf(get_device_sm() < 100, "Test requires CUDA SM 100 or higher")
class TestPiecewiseCudaGraphLlama31FP4(CustomTestCase):
    """MGSM test: piecewise CUDA graph with NVFP4 Llama3.1 8B on Blackwell."""

    @classmethod
    def setUpClass(cls):
        cls.model = "nvidia/Llama-3.1-8B-Instruct-FP4"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # FP4 quantization plus a reduced static memory fraction.
        server_flags = [
            "--enable-piecewise-cuda-graph",
            "--quantization",
            "modelopt_fp4",
            "--mem-fraction-static",
            "0.8",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=server_flags,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_mgsm_accuracy(self):
        """Run the MGSM (English) eval and require a minimum score."""
        example_count = 1319
        eval_args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=example_count,
            # Cap client-side fan-out at 1024 threads.
            num_threads=min(example_count, 1024),
        )
        scores = run_eval(eval_args)
        print(f"MGSM Accuracy: {scores['score']:.3f}")
        self.assertGreaterEqual(scores["score"], 0.78)


class TestPiecewiseCudaGraphQwen3MoE(CustomTestCase):
    """Test piecewise CUDA graph with Qwen3-Coder-30B-A3B-Instruct MoE model"""

    @classmethod
    def setUpClass(cls):
        cls.model = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--enable-piecewise-cuda-graph",
                # Select the eager compiler backend for piecewise CUDA graph.
                "--piecewise-cuda-graph-compiler",
                "eager",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k_accuracy(self):
        """Check accuracy on the MGSM (English) eval.

        NOTE(review): the original docstring and log line claimed "GSM8K
        with 8-shot", but this test actually runs ``eval_name="mgsm_en"``
        through ``run_eval`` — not the few-shot GSM8K harness
        (``run_eval_few_shot_gsm8k``) used by the DeepSeek test below.
        The docstring and log message are corrected here; the method name
        is kept unchanged so test discovery / CI filters stay stable.
        """
        num_examples = 2000

        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=num_examples,
            # Cap client-side fan-out at 1024 threads.
            num_threads=min(num_examples, 1024),
        )

        metrics = run_eval(args)
        # Corrected label: this score comes from the mgsm_en eval.
        print(f"MGSM Accuracy: {metrics['score']:.3f}")

        self.assertGreaterEqual(metrics["score"], 0.90)


class TestPiecewiseCudaGraphDeepSeek(CustomTestCase):
    """Few-shot GSM8K check for the MLA test model with piecewise CUDA graph."""

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
        cls.base_url = DEFAULT_URL_FOR_TEST
        server_flags = [
            "--enable-piecewise-cuda-graph",
            "--piecewise-cuda-graph-compiler",
            "eager",
            # Should be less than max_context_len.
            "--piecewise-cuda-graph-max-tokens",
            "4096",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=server_flags,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        """Run the 5-shot GSM8K harness against the launched server."""
        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval_few_shot_gsm8k(eval_args)
        print(metrics)
        self.assertGreater(metrics["accuracy"], 0.62)


if __name__ == "__main__":
    # Run all test classes in this module when executed as a script.
    unittest.main()
Loading
Loading