From a7bebf599738da150d996803c663a0a3bbb6e617 Mon Sep 17 00:00:00 2001 From: alisonshao Date: Fri, 5 Dec 2025 17:27:52 -0800 Subject: [PATCH 1/6] [CI] Add Mistral Large 3 Eagle nightly performance test Add nightly CI test for mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle model. The test includes: - Benchmark test for batch sizes [1, 1, 8, 16, 64] - MGSM accuracy evaluation (threshold 0.90) Eagle-specific configuration: - --speculative-moe-runner-backend flashinfer_trtllm - --kv-cache-dtype auto (to avoid low AR with FP8 kv cache) - --attention-backend trtllm_mla - --tp 8 --- .github/workflows/nightly-test-nvidia.yml | 21 ++++ .../nightly/test_mistral_large3_eagle_perf.py | 110 ++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 test/nightly/test_mistral_large3_eagle_perf.py diff --git a/.github/workflows/nightly-test-nvidia.yml b/.github/workflows/nightly-test-nvidia.yml index f9dd81de9957..22b4e65805df 100644 --- a/.github/workflows/nightly-test-nvidia.yml +++ b/.github/workflows/nightly-test-nvidia.yml @@ -443,6 +443,27 @@ jobs: run: | python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_mistral_large3 + - name: Run Mistral-Large-3-Eagle nightly performance test + if: always() + timeout-minutes: 180 + env: + TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }} + PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }} + GPU_CONFIG: "8-gpu-b200" + SGLANG_ENABLE_JIT_DEEPGEMM: "0" + run: | + rm -rf test/performance_profiles_mistral_large3_eagle/ + cd test + IS_BLACKWELL=1 python3 nightly/test_mistral_large3_eagle_perf.py + + - name: Publish Mistral-Large-3-Eagle traces to storage repo + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + run: | + python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_mistral_large3_eagle + - name: Run DeepSeek v3.1 nightly performance test if: always() timeout-minutes: 180 diff --git a/test/nightly/test_mistral_large3_eagle_perf.py b/test/nightly/test_mistral_large3_eagle_perf.py new file mode 100644 index 000000000000..0297b4705dee --- /dev/null +++ b/test/nightly/test_mistral_large3_eagle_perf.py @@ -0,0 +1,110 @@ +import os +import unittest +from types import SimpleNamespace + +from nightly_utils import NightlyBenchmarkRunner + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + _parse_int_list_env, + popen_launch_server, +) + +register_cuda_ci(est_time=600, suite="nightly-8-gpu-b200", nightly=True) + +MISTRAL_LARGE3_EAGLE_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle" +PROFILE_DIR = "performance_profiles_mistral_large3_eagle" + + +class TestNightlyMistralLarge3EaglePerformance(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Set environment variable to disable JIT DeepGemm + os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"] = "0" + + cls.model = MISTRAL_LARGE3_EAGLE_MODEL_PATH + cls.base_url = DEFAULT_URL_FOR_TEST + cls.batch_sizes = [1, 1, 8, 16, 64] + cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096")) + cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512")) + + # Mistral-Large-3-Eagle requires TP=8, trtllm_mla attention backend, + # speculative MoE runner backend, and auto kv cache dtype + cls.other_args = [ + "--tp", + "8", + "--attention-backend", + "trtllm_mla", + "--speculative-moe-runner-backend", + "flashinfer_trtllm", + "--kv-cache-dtype", + "auto", + "--model-loader-extra-config", + '{"enable_multithread_load": true}', + "--chat-template", + "mistral", + ] + + cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url) + cls.runner.setup_profile_directory() + + @classmethod + def tearDownClass(cls): + # Clean up environment variable + if "SGLANG_ENABLE_JIT_DEEPGEMM" in os.environ: + del os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"] + + def test_bench_one_batch(self): + results, success = self.runner.run_benchmark_for_model( + model_path=self.model, + batch_sizes=self.batch_sizes, + input_lens=self.input_lens, + output_lens=self.output_lens, + other_args=self.other_args, + ) + + self.runner.add_report(results) + self.runner.write_final_report() + + if not success: + raise AssertionError( + f"Benchmark failed for {self.model}. Check the logs for details." + ) + + def test_accuracy_mgsm(self): + """Run MGSM accuracy evaluation for Mistral Large 3 Eagle.""" + process = popen_launch_server( + model=self.model, + base_url=self.base_url, + other_args=self.other_args, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + ) + + try: + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=None, + num_threads=1024, + ) + metrics = run_eval(args) + print(f"MGSM accuracy for {self.model}: {metrics['score']}") + + # Placeholder threshold - adjust after first successful run + expected_threshold = 0.90 + self.assertGreaterEqual( + metrics["score"], + expected_threshold, + f"MGSM accuracy {metrics['score']} below threshold {expected_threshold}", + ) + finally: + kill_process_tree(process.pid) + + +if __name__ == "__main__": + unittest.main() From 218e8b5ac65620f4627a2523071729f3430bbea4 Mon Sep 17 00:00:00 2001 From: alisonshao Date: Fri, 5 Dec 2025 20:42:09 -0800 Subject: [PATCH 2/6] Fix Eagle test: use base model with Eagle as draft model for speculative decoding --- .../nightly/test_mistral_large3_eagle_perf.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/test/nightly/test_mistral_large3_eagle_perf.py b/test/nightly/test_mistral_large3_eagle_perf.py index 0297b4705dee..c7928702b7c7 100644 --- a/test/nightly/test_mistral_large3_eagle_perf.py +++ b/test/nightly/test_mistral_large3_eagle_perf.py @@ -16,6 +16,8 @@ register_cuda_ci(est_time=600, suite="nightly-8-gpu-b200", nightly=True) +# Base model and Eagle draft model +MISTRAL_LARGE3_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512" MISTRAL_LARGE3_EAGLE_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle" PROFILE_DIR = "performance_profiles_mistral_large3_eagle" @@ -26,21 +28,29 @@ def setUpClass(cls): # Set environment variable to disable JIT DeepGemm os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"] = "0" - cls.model = MISTRAL_LARGE3_EAGLE_MODEL_PATH + cls.model = MISTRAL_LARGE3_MODEL_PATH cls.base_url = DEFAULT_URL_FOR_TEST cls.batch_sizes = [1, 1, 8, 16, 64] cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096")) cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512")) - # Mistral-Large-3-Eagle requires TP=8, trtllm_mla attention backend, - # speculative MoE runner backend, and auto kv cache dtype + # Mistral-Large-3 with Eagle speculative decoding + # Eagle model is used as draft model for speculative decoding cls.other_args = [ "--tp", "8", "--attention-backend", "trtllm_mla", - "--speculative-moe-runner-backend", - "flashinfer_trtllm", + "--speculative-algorithm", + "EAGLE", + "--speculative-draft-model-path", + MISTRAL_LARGE3_EAGLE_MODEL_PATH, + "--speculative-num-steps", + "3", + "--speculative-eagle-topk", + "4", + "--speculative-num-draft-tokens", + "16", "--kv-cache-dtype", "auto", "--model-loader-extra-config", @@ -76,7 +86,7 @@ def test_bench_one_batch(self): ) def test_accuracy_mgsm(self): - """Run MGSM accuracy evaluation for Mistral Large 3 Eagle.""" + """Run MGSM accuracy evaluation for Mistral Large 3 with Eagle.""" process = popen_launch_server( model=self.model, base_url=self.base_url, @@ -93,7 +103,7 @@ def test_accuracy_mgsm(self): num_threads=1024, ) metrics = run_eval(args) - print(f"MGSM accuracy for {self.model}: {metrics['score']}") + print(f"MGSM accuracy for {self.model} with Eagle: {metrics['score']}") # Placeholder threshold - adjust after first successful run expected_threshold = 0.90 From c1841e41cc5f04939a4035273a1d571e2f3d4ee5 Mon Sep 17 00:00:00 2001 From: alisonshao Date: Sat, 6 Dec 2025 01:44:58 -0800 Subject: [PATCH 3/6] Fix: use speculative-eagle-topk=1 for trtllm_mla backend compatibility --- test/nightly/test_mistral_large3_eagle_perf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/nightly/test_mistral_large3_eagle_perf.py b/test/nightly/test_mistral_large3_eagle_perf.py index c7928702b7c7..85489540da65 100644 --- a/test/nightly/test_mistral_large3_eagle_perf.py +++ b/test/nightly/test_mistral_large3_eagle_perf.py @@ -48,9 +48,9 @@ def setUpClass(cls): "--speculative-num-steps", "3", "--speculative-eagle-topk", - "4", + "1", "--speculative-num-draft-tokens", - "16", + "4", "--kv-cache-dtype", "auto", "--model-loader-extra-config", From 356eecadd6447b38d5ddf88790d44e3acfe34cb9 Mon Sep 17 00:00:00 2001 From: alisonshao Date: Sat, 6 Dec 2025 03:00:06 -0800 Subject: [PATCH 4/6] Fix: incorrect import path in mistral_large_3_eagle.py preventing model registration --- python/sglang/srt/models/mistral_large_3_eagle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/models/mistral_large_3_eagle.py b/python/sglang/srt/models/mistral_large_3_eagle.py index f136640fde78..18f401507199 100644 --- a/python/sglang/srt/models/mistral_large_3_eagle.py +++ b/python/sglang/srt/models/mistral_large_3_eagle.py @@ -4,7 +4,7 @@ from torch import nn from transformers import PretrainedConfig -from python.sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp +from sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp from sglang.srt.distributed import get_pp_group from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import RowParallelLinear From 3436a76ee76db1d1d1f9ac5db40d4427e543cebb Mon Sep 17 00:00:00 2001 From: alisonshao Date: Sat, 6 Dec 2025 21:53:22 -0800 Subject: [PATCH 5/6] Fix isort import order --- python/sglang/srt/models/mistral_large_3_eagle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/models/mistral_large_3_eagle.py b/python/sglang/srt/models/mistral_large_3_eagle.py index 18f401507199..08f7271fde6c 100644 --- a/python/sglang/srt/models/mistral_large_3_eagle.py +++ b/python/sglang/srt/models/mistral_large_3_eagle.py @@ -4,8 +4,8 @@ from torch import nn from transformers import PretrainedConfig -from sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp from sglang.srt.distributed import get_pp_group +from sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import RowParallelLinear from sglang.srt.layers.quantization.base_config import QuantizationConfig From 95e4668846be6cd758e8a4854255f9f19da78df5 Mon Sep 17 00:00:00 2001 From: alisonshao Date: Thu, 11 Dec 2025 04:27:49 -0800 Subject: [PATCH 6/6] Consolidate Eagle test into existing Mistral Large 3 nightly test Move the Mistral Large 3 Eagle test into the same test file as the base Mistral Large 3 test, removing the need for a separate workflow step. This keeps the Eagle test within the existing nightly test job. --- .github/workflows/nightly-test-nvidia.yml | 21 +-- .../nightly/test_mistral_large3_eagle_perf.py | 120 ------------------ test/nightly/test_mistral_large3_perf.py | 99 +++++++++++++++ 3 files changed, 100 insertions(+), 140 deletions(-) delete mode 100644 test/nightly/test_mistral_large3_eagle_perf.py diff --git a/.github/workflows/nightly-test-nvidia.yml b/.github/workflows/nightly-test-nvidia.yml index 84492b8e05bd..d86006f1eea0 100644 --- a/.github/workflows/nightly-test-nvidia.yml +++ b/.github/workflows/nightly-test-nvidia.yml @@ -432,6 +432,7 @@ jobs: SGLANG_ENABLE_JIT_DEEPGEMM: "0" run: | rm -rf test/performance_profiles_mistral_large3/ + rm -rf test/performance_profiles_mistral_large3_eagle/ cd test IS_BLACKWELL=1 python3 nightly/test_mistral_large3_perf.py @@ -442,26 +443,6 @@ jobs: GITHUB_RUN_NUMBER: ${{ github.run_number }} run: | python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_mistral_large3 - - - name: Run Mistral-Large-3-Eagle nightly performance test - if: always() - timeout-minutes: 180 - env: - TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }} - PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }} - GPU_CONFIG: "8-gpu-b200" - SGLANG_ENABLE_JIT_DEEPGEMM: "0" - run: | - rm -rf test/performance_profiles_mistral_large3_eagle/ - cd test - IS_BLACKWELL=1 python3 nightly/test_mistral_large3_eagle_perf.py - - - name: Publish Mistral-Large-3-Eagle traces to storage repo - env: - GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }} - GITHUB_RUN_ID: ${{ github.run_id }} - GITHUB_RUN_NUMBER: ${{ github.run_number }} - run: | python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_mistral_large3_eagle - name: Run DeepSeek v3.1 nightly performance test diff --git a/test/nightly/test_mistral_large3_eagle_perf.py b/test/nightly/test_mistral_large3_eagle_perf.py deleted file mode 100644 index 85489540da65..000000000000 --- a/test/nightly/test_mistral_large3_eagle_perf.py +++ /dev/null @@ -1,120 +0,0 @@ -import os -import unittest -from types import SimpleNamespace - -from nightly_utils import NightlyBenchmarkRunner - -from sglang.srt.utils import kill_process_tree -from sglang.test.ci.ci_register import register_cuda_ci -from sglang.test.run_eval import run_eval -from sglang.test.test_utils import ( - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - _parse_int_list_env, - popen_launch_server, -) - -register_cuda_ci(est_time=600, suite="nightly-8-gpu-b200", nightly=True) - -# Base model and Eagle draft model -MISTRAL_LARGE3_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512" -MISTRAL_LARGE3_EAGLE_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle" -PROFILE_DIR = "performance_profiles_mistral_large3_eagle" - - -class TestNightlyMistralLarge3EaglePerformance(unittest.TestCase): - @classmethod - def setUpClass(cls): - # Set environment variable to disable JIT DeepGemm - os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"] = "0" - - cls.model = MISTRAL_LARGE3_MODEL_PATH - cls.base_url = DEFAULT_URL_FOR_TEST - cls.batch_sizes = [1, 1, 8, 16, 64] - cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096")) - cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512")) - - # Mistral-Large-3 with Eagle speculative decoding - # Eagle model is used as draft model for speculative decoding - cls.other_args = [ - "--tp", - "8", - "--attention-backend", - "trtllm_mla", - "--speculative-algorithm", - "EAGLE", - "--speculative-draft-model-path", - MISTRAL_LARGE3_EAGLE_MODEL_PATH, - "--speculative-num-steps", - "3", - "--speculative-eagle-topk", - "1", - "--speculative-num-draft-tokens", - "4", - "--kv-cache-dtype", - "auto", - "--model-loader-extra-config", - '{"enable_multithread_load": true}', - "--chat-template", - "mistral", - ] - - cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url) - cls.runner.setup_profile_directory() - - @classmethod - def tearDownClass(cls): - # Clean up environment variable - if "SGLANG_ENABLE_JIT_DEEPGEMM" in os.environ: - del os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"] - - def test_bench_one_batch(self): - results, success = self.runner.run_benchmark_for_model( - model_path=self.model, - batch_sizes=self.batch_sizes, - input_lens=self.input_lens, - output_lens=self.output_lens, - other_args=self.other_args, - ) - - self.runner.add_report(results) - self.runner.write_final_report() - - if not success: - raise AssertionError( - f"Benchmark failed for {self.model}. Check the logs for details." - ) - - def test_accuracy_mgsm(self): - """Run MGSM accuracy evaluation for Mistral Large 3 with Eagle.""" - process = popen_launch_server( - model=self.model, - base_url=self.base_url, - other_args=self.other_args, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - ) - - try: - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mgsm_en", - num_examples=None, - num_threads=1024, - ) - metrics = run_eval(args) - print(f"MGSM accuracy for {self.model} with Eagle: {metrics['score']}") - - # Placeholder threshold - adjust after first successful run - expected_threshold = 0.90 - self.assertGreaterEqual( - metrics["score"], - expected_threshold, - f"MGSM accuracy {metrics['score']} below threshold {expected_threshold}", - ) - finally: - kill_process_tree(process.pid) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/nightly/test_mistral_large3_perf.py b/test/nightly/test_mistral_large3_perf.py index c4272a5f3135..3e6feb52d1c9 100644 --- a/test/nightly/test_mistral_large3_perf.py +++ b/test/nightly/test_mistral_large3_perf.py @@ -17,6 +17,7 @@ register_cuda_ci(est_time=600, suite="nightly-8-gpu-b200", nightly=True) MISTRAL_LARGE3_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512" +MISTRAL_LARGE3_EAGLE_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle" PROFILE_DIR = "performance_profiles_mistral_large3" @@ -101,5 +102,103 @@ def test_accuracy_mgsm(self): kill_process_tree(process.pid) +class TestNightlyMistralLarge3EaglePerformance(unittest.TestCase): + """Test Mistral Large 3 with Eagle speculative decoding.""" + + @classmethod + def setUpClass(cls): + # Set environment variable to disable JIT DeepGemm + os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"] = "0" + + cls.model = MISTRAL_LARGE3_MODEL_PATH + cls.base_url = DEFAULT_URL_FOR_TEST + cls.batch_sizes = [1, 1, 8, 16, 64] + cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096")) + cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512")) + + # Mistral-Large-3 with Eagle speculative decoding + # Eagle model is used as draft model for speculative decoding + cls.other_args = [ + "--tp", + "8", + "--attention-backend", + "trtllm_mla", + "--speculative-algorithm", + "EAGLE", + "--speculative-draft-model-path", + MISTRAL_LARGE3_EAGLE_MODEL_PATH, + "--speculative-num-steps", + "3", + "--speculative-eagle-topk", + "1", + "--speculative-num-draft-tokens", + "4", + "--kv-cache-dtype", + "auto", + "--model-loader-extra-config", + '{"enable_multithread_load": true}', + "--chat-template", + "mistral", + ] + + cls.runner = NightlyBenchmarkRunner( + "performance_profiles_mistral_large3_eagle", cls.__name__, cls.base_url + ) + cls.runner.setup_profile_directory() + + @classmethod + def tearDownClass(cls): + # Clean up environment variable + if "SGLANG_ENABLE_JIT_DEEPGEMM" in os.environ: + del os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"] + + def test_eagle_bench_one_batch(self): + results, success = self.runner.run_benchmark_for_model( + model_path=self.model, + batch_sizes=self.batch_sizes, + input_lens=self.input_lens, + output_lens=self.output_lens, + other_args=self.other_args, + ) + + self.runner.add_report(results) + self.runner.write_final_report() + + if not success: + raise AssertionError( + f"Benchmark failed for {self.model} with Eagle. Check the logs for details." + ) + + def test_eagle_accuracy_mgsm(self): + """Run MGSM accuracy evaluation for Mistral Large 3 with Eagle.""" + process = popen_launch_server( + model=self.model, + base_url=self.base_url, + other_args=self.other_args, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + ) + + try: + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=None, + num_threads=1024, + ) + metrics = run_eval(args) + print(f"MGSM accuracy for {self.model} with Eagle: {metrics['score']}") + + # Placeholder threshold - adjust after first successful run + expected_threshold = 0.90 + self.assertGreaterEqual( + metrics["score"], + expected_threshold, + f"MGSM accuracy {metrics['score']} below threshold {expected_threshold}", + ) + finally: + kill_process_tree(process.pid) + + if __name__ == "__main__": unittest.main()