diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 5fb0f573c5ae..541d59b10901 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -87,6 +87,8 @@ TestFile("test_original_logprobs.py", 41), TestFile("test_page_size.py", 60), TestFile("test_penalty.py", 82), + TestFile("test_piecewise_cuda_graph_1_gpu_a.py", 460), + TestFile("test_piecewise_cuda_graph_1_gpu_b.py", 480), TestFile("test_priority_scheduling.py", 130), TestFile("test_pytorch_sampling_backend.py", 66), TestFile("test_radix_attention.py", 105), @@ -117,7 +119,7 @@ TestFile("test_triton_sliding_window.py", 84), TestFile("test_utils_update_weights.py", 29), TestFile("test_video_utils.py", 5), - TestFile("test_vision_chunked_prefill.py", 117), + TestFile("test_vision_chunked_prefill.py", 150), TestFile("test_vision_openai_server_a.py", 778), TestFile("test_vlm_input_format.py", 166), TestFile("test_modelopt_export.py", 9), @@ -137,6 +139,7 @@ TestFile("test_dp_attention.py", 350), TestFile("test_load_weights_from_remote_instance.py", 72), TestFile("test_patch_torch.py", 19), + TestFile("test_piecewise_cuda_graph_2_gpu.py", 200), TestFile("test_eagle_dp_attention.py", 200), ], "per-commit-4-gpu": [ @@ -145,7 +148,6 @@ TestFile("test_local_attn.py", 411), TestFile("test_multi_instance_release_memory_occupation.py", 64), TestFile("test_pp_single_node.py", 500), - TestFile("test_piecewise_cuda_graph.py", 1260), TestFile("test_epd_disaggregation.py", 150), ], "per-commit-8-gpu-h200": [ diff --git a/test/srt/test_piecewise_cuda_graph_1_gpu_a.py b/test/srt/test_piecewise_cuda_graph_1_gpu_a.py new file mode 100644 index 000000000000..a72759a4e420 --- /dev/null +++ b/test/srt/test_piecewise_cuda_graph_1_gpu_a.py @@ -0,0 +1,175 @@ +import unittest + +from sglang.srt.utils import get_device_sm, kill_process_tree +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_MODEL_NAME_FOR_TEST_MLA, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + SimpleNamespace, + popen_launch_server, + run_bench_one_batch, +) + + +class TestPiecewiseCudaGraphCorrectness(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=["--enable-piecewise-cuda-graph"], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mmlu", + num_examples=64, + num_threads=32, + ) + + metrics = run_eval(args) + self.assertGreaterEqual(metrics["score"], 0.65) + + +class TestPiecewiseCudaGraphBenchmark(CustomTestCase): + + def test_latency(self): + prefill_latency, _, _ = run_bench_one_batch( + DEFAULT_MODEL_NAME_FOR_TEST, + other_args=["--enable-piecewise-cuda-graph"], + ) + self.assertLess(prefill_latency, 0.015) + + +@unittest.skipIf(get_device_sm() < 100, "Test requires CUDA SM 100 or higher") +class TestPiecewiseCudaGraphLlama31FP4(CustomTestCase): + """MGSM test: piecewise CUDA graph with NVFP4 Llama3.1 8B on Blackwell.""" + + @classmethod + def setUpClass(cls): + cls.model = "nvidia/Llama-3.1-8B-Instruct-FP4" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--enable-piecewise-cuda-graph", + "--quantization", + "modelopt_fp4", + "--mem-fraction-static", + "0.8", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_mgsm_accuracy(self): + num_examples = 1319 + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=num_examples, + num_threads=min(num_examples, 1024), + ) + metrics = run_eval(args) + print(f"MGSM Accuracy: {metrics['score']:.3f}") + self.assertGreaterEqual(metrics["score"], 0.78) + + +class TestPiecewiseCudaGraphQwen3MoE(CustomTestCase): + """Test piecewise CUDA graph with Qwen3-Coder-30B-A3B-Instruct MoE model""" + + @classmethod + def setUpClass(cls): + cls.model = "Qwen/Qwen3-Coder-30B-A3B-Instruct" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--enable-piecewise-cuda-graph", + "--piecewise-cuda-graph-compiler", + "eager", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k_accuracy(self): + """Test GSM8K accuracy with 8-shot setting""" + num_examples = 2000 + + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=num_examples, + num_threads=min(num_examples, 1024), + ) + + metrics = run_eval(args) + print(f"GSM8K Accuracy: {metrics['score']:.3f}") + + self.assertGreaterEqual(metrics["score"], 0.90) + + +class TestPiecewiseCudaGraphDeepSeek(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--enable-piecewise-cuda-graph", + "--piecewise-cuda-graph-compiler", + "eager", + "--piecewise-cuda-graph-max-tokens", + "4096", # should less than max_context_len + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.62) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_piecewise_cuda_graph.py b/test/srt/test_piecewise_cuda_graph_1_gpu_b.py similarity index 50% rename from test/srt/test_piecewise_cuda_graph.py rename to test/srt/test_piecewise_cuda_graph_1_gpu_b.py index 75d80cf20663..dc2502eb4c46 100644 --- a/test/srt/test_piecewise_cuda_graph.py +++ b/test/srt/test_piecewise_cuda_graph_1_gpu_b.py @@ -4,216 +4,18 @@ from sglang import Engine from sglang.lang.chat_template import get_chat_template_by_model_path -from sglang.srt.utils import get_device_sm, kill_process_tree -from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_IMAGE_URL, - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_MODEL_NAME_FOR_TEST_MLA, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, CustomTestCase, SimpleNamespace, popen_launch_server, - run_bench_one_batch, ) -class TestPiecewiseCudaGraphCorrectness(CustomTestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--enable-piecewise-cuda-graph"], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_mmlu(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mmlu", - num_examples=64, - num_threads=32, - ) - - metrics = run_eval(args) - self.assertGreaterEqual(metrics["score"], 0.65) - - -class TestPiecewiseCudaGraphBenchmark(CustomTestCase): - - def test_latency(self): - prefill_latency, _, _ = run_bench_one_batch( - DEFAULT_MODEL_NAME_FOR_TEST, - other_args=["--enable-piecewise-cuda-graph"], - ) - self.assertLess(prefill_latency, 0.015) - - -@unittest.skipIf(get_device_sm() < 100, "Test requires CUDA SM 100 or higher") -class TestPiecewiseCudaGraphLlama31FP4(CustomTestCase): - """MGSM test: piecewise CUDA graph with NVFP4 Llama3.1 8B on Blackwell.""" - - @classmethod - def setUpClass(cls): - cls.model = "nvidia/Llama-3.1-8B-Instruct-FP4" - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--enable-piecewise-cuda-graph", - "--quantization", - "modelopt_fp4", - "--mem-fraction-static", - "0.8", - ], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_mgsm_accuracy(self): - num_examples = 1319 - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mgsm_en", - num_examples=num_examples, - num_threads=min(num_examples, 1024), - ) - metrics = run_eval(args) - print(f"MGSM Accuracy: {metrics['score']:.3f}") - self.assertGreaterEqual(metrics["score"], 0.78) - - -class TestPiecewiseCudaGraphQwen3MoE(CustomTestCase): - """Test piecewise CUDA graph with Qwen3-Coder-30B-A3B-Instruct MoE model""" - - @classmethod - def setUpClass(cls): - cls.model = "Qwen/Qwen3-Coder-30B-A3B-Instruct" - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--enable-piecewise-cuda-graph", - "--piecewise-cuda-graph-compiler", - "eager", - ], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_gsm8k_accuracy(self): - """Test GSM8K accuracy with 8-shot setting""" - num_examples = 2000 - - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mgsm_en", - num_examples=num_examples, - num_threads=min(num_examples, 1024), - ) - - metrics = run_eval(args) - print(f"GSM8K Accuracy: {metrics['score']:.3f}") - - self.assertGreaterEqual(metrics["score"], 0.90) - - -class TestPiecewiseCudaGraphDeepSeek(CustomTestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--enable-piecewise-cuda-graph", - "--piecewise-cuda-graph-compiler", - "eager", - "--piecewise-cuda-graph-max-tokens", - "4096", # should less than max_context_len - ], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_gsm8k(self): - args = SimpleNamespace( - num_shots=5, - data_path=None, - num_questions=200, - max_new_tokens=512, - parallel=128, - host="http://127.0.0.1", - port=int(self.base_url.split(":")[-1]), - ) - metrics = run_eval_few_shot_gsm8k(args) - print(metrics) - - self.assertGreater(metrics["accuracy"], 0.62) - - -class TestPiecewiseCudaGraphAWQ(CustomTestCase): - """Test piecewise CUDA graph with AWQ quantized model""" - - @classmethod - def setUpClass(cls): - cls.model = "Qwen/QwQ-32B-AWQ" - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--enable-piecewise-cuda-graph"], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_mgsm_accuracy(self): - """Test MGSM accuracy with AWQ model""" - num_examples = 1319 - - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mgsm_en", - num_examples=num_examples, - num_threads=min(num_examples, 1024), - ) - - metrics = run_eval(args) - print(f"MGSM Accuracy: {metrics['score']:.3f}") - print(f"Output throughput: {metrics.get('throughput', 'N/A')} token/s") - - # Expected accuracy: 0.680, allow some variance - self.assertGreaterEqual(metrics["score"], 0.65) - - class TestPiecewiseCudaGraphGPTQ(CustomTestCase): @classmethod @@ -288,43 +90,6 @@ def test_mgsm_accuracy(self): print(f"MGSM Accuracy: {metrics['score']:.3f}") -class TestPiecewiseCudaGraphW8A8Int8(CustomTestCase): - """Test piecewise CUDA graph with W8A8 INT8 quantized model""" - - @classmethod - def setUpClass(cls): - cls.model = "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8" - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--enable-piecewise-cuda-graph", - "--quantization", - "w8a8_int8", - ], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_mgsm_accuracy(self): - """Test MGSM accuracy with W8A8 INT8 model""" - num_examples = 1319 - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mgsm_en", - num_examples=num_examples, - num_threads=min(num_examples, 1024), - ) - metrics = run_eval(args) - print(f"MGSM Accuracy: {metrics['score']:.3f}") - self.assertGreaterEqual(metrics["score"], 0.40) - - class TestPiecewiseCudaGraphQwen25VL(CustomTestCase): """Test piecewise CUDA graph with Qwen2.5-VL-7B-Instruct model""" @@ -443,33 +208,27 @@ def test_embedding(self): ) -class TestPiecewiseCudaGraphQwen3OmniMOE(CustomTestCase): - """Test piecewise CUDA graph with Qwen3-Omni-30B-A3B-Instruct model""" +class TestPiecewiseCudaGraphAWQ(CustomTestCase): + """Test piecewise CUDA graph with AWQ quantized model""" @classmethod def setUpClass(cls): - cls.model = "Qwen/Qwen3-Omni-30B-A3B-Instruct" + cls.model = "Qwen/QwQ-32B-AWQ" cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--enable-piecewise-cuda-graph", - "--piecewise-cuda-graph-compiler", - "eager", - "--disable-radix-cache", - "--tp=4", - ], + other_args=["--enable-piecewise-cuda-graph"], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) - def test_gsm8k_accuracy(self): - """Test GSM8K accuracy with 8-shot setting""" - num_examples = 2000 + def test_mgsm_accuracy(self): + """Test MGSM accuracy with AWQ model""" + num_examples = 1319 args = SimpleNamespace( base_url=self.base_url, @@ -480,9 +239,11 @@ def test_gsm8k_accuracy(self): ) metrics = run_eval(args) - print(f"GSM8K Accuracy: {metrics['score']:.3f}") + print(f"MGSM Accuracy: {metrics['score']:.3f}") + print(f"Output throughput: {metrics.get('throughput', 'N/A')} token/s") - self.assertGreaterEqual(metrics["score"], 0.70) + # Expected accuracy: 0.680, allow some variance + self.assertGreaterEqual(metrics["score"], 0.65) if __name__ == "__main__": diff --git a/test/srt/test_piecewise_cuda_graph_2_gpu.py b/test/srt/test_piecewise_cuda_graph_2_gpu.py new file mode 100644 index 000000000000..5b6c40f426e7 --- /dev/null +++ b/test/srt/test_piecewise_cuda_graph_2_gpu.py @@ -0,0 +1,57 @@ +import unittest + +from sglang.srt.utils import kill_process_tree +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + SimpleNamespace, + popen_launch_server, +) + + +class TestPiecewiseCudaGraphQwen3OmniMOE(CustomTestCase): + """Test piecewise CUDA graph with Qwen3-Omni-30B-A3B-Instruct model""" + + @classmethod + def setUpClass(cls): + cls.model = "Qwen/Qwen3-Omni-30B-A3B-Instruct" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--enable-piecewise-cuda-graph", + "--piecewise-cuda-graph-compiler", + "eager", + "--disable-radix-cache", + "--tp=2", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k_accuracy(self): + """Test GSM8K accuracy with 8-shot setting""" + num_examples = 2000 + + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=num_examples, + num_threads=min(num_examples, 1024), + ) + + metrics = run_eval(args) + print(f"GSM8K Accuracy: {metrics['score']:.3f}") + + self.assertGreaterEqual(metrics["score"], 0.70) + + +if __name__ == "__main__": + unittest.main()