From a7bebf599738da150d996803c663a0a3bbb6e617 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Fri, 5 Dec 2025 17:27:52 -0800
Subject: [PATCH 1/6] [CI] Add Mistral Large 3 Eagle nightly performance test

Add a nightly CI test for the mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle model.

The test includes:
- Benchmark test for batch sizes [1, 1, 8, 16, 64]
- MGSM accuracy evaluation (threshold 0.90)

Eagle-specific configuration:
- --speculative-moe-runner-backend flashinfer_trtllm
- --kv-cache-dtype auto (to avoid a low acceptance rate (AR) with the FP8 KV cache)
- --attention-backend trtllm_mla
- --tp 8
---
 .github/workflows/nightly-test-nvidia.yml     |  21 ++++
 .../nightly/test_mistral_large3_eagle_perf.py | 110 ++++++++++++++++++
 2 files changed, 131 insertions(+)
 create mode 100644 test/nightly/test_mistral_large3_eagle_perf.py

diff --git a/.github/workflows/nightly-test-nvidia.yml b/.github/workflows/nightly-test-nvidia.yml
index f9dd81de9957..22b4e65805df 100644
--- a/.github/workflows/nightly-test-nvidia.yml
+++ b/.github/workflows/nightly-test-nvidia.yml
@@ -443,6 +443,27 @@ jobs:
         run: |
           python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_mistral_large3
 
+      - name: Run Mistral-Large-3-Eagle nightly performance test
+        if: always()
+        timeout-minutes: 180
+        env:
+          TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
+          PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
+          GPU_CONFIG: "8-gpu-b200"
+          SGLANG_ENABLE_JIT_DEEPGEMM: "0"
+        run: |
+          rm -rf test/performance_profiles_mistral_large3_eagle/
+          cd test
+          IS_BLACKWELL=1 python3 nightly/test_mistral_large3_eagle_perf.py
+
+      - name: Publish Mistral-Large-3-Eagle traces to storage repo
+        env:
+          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+          GITHUB_RUN_NUMBER: ${{ github.run_number }}
+        run: |
+          python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_mistral_large3_eagle
+
       - name: Run DeepSeek v3.1 nightly performance test
         if: always()
         timeout-minutes: 180
diff --git a/test/nightly/test_mistral_large3_eagle_perf.py b/test/nightly/test_mistral_large3_eagle_perf.py
new file mode 100644
index 000000000000..0297b4705dee
--- /dev/null
+++ b/test/nightly/test_mistral_large3_eagle_perf.py
@@ -0,0 +1,110 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+from nightly_utils import NightlyBenchmarkRunner
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    _parse_int_list_env,
+    popen_launch_server,
+)
+
+register_cuda_ci(est_time=600, suite="nightly-8-gpu-b200", nightly=True)
+
+MISTRAL_LARGE3_EAGLE_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle"
+PROFILE_DIR = "performance_profiles_mistral_large3_eagle"
+
+
+class TestNightlyMistralLarge3EaglePerformance(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        # Set environment variable to disable JIT DeepGemm
+        os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"] = "0"
+
+        cls.model = MISTRAL_LARGE3_EAGLE_MODEL_PATH
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.batch_sizes = [1, 1, 8, 16, 64]
+        cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
+        cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512"))
+
+        # Mistral-Large-3-Eagle requires TP=8, trtllm_mla attention backend,
+        # speculative MoE runner backend, and auto kv cache dtype
+        cls.other_args = [
+            "--tp",
+            "8",
+            "--attention-backend",
+            "trtllm_mla",
+            "--speculative-moe-runner-backend",
+            "flashinfer_trtllm",
+            "--kv-cache-dtype",
+            "auto",
+            "--model-loader-extra-config",
+            '{"enable_multithread_load": true}',
+            "--chat-template",
+            "mistral",
+        ]
+
+        cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url)
+        cls.runner.setup_profile_directory()
+
+    @classmethod
+    def tearDownClass(cls):
+        # Clean up environment variable
+        if "SGLANG_ENABLE_JIT_DEEPGEMM" in os.environ:
+            del os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"]
+
+    def test_bench_one_batch(self):
+        results, success = self.runner.run_benchmark_for_model(
+            model_path=self.model,
+            batch_sizes=self.batch_sizes,
+            input_lens=self.input_lens,
+            output_lens=self.output_lens,
+            other_args=self.other_args,
+        )
+
+        self.runner.add_report(results)
+        self.runner.write_final_report()
+
+        if not success:
+            raise AssertionError(
+                f"Benchmark failed for {self.model}. Check the logs for details."
+            )
+
+    def test_accuracy_mgsm(self):
+        """Run MGSM accuracy evaluation for Mistral Large 3 Eagle."""
+        process = popen_launch_server(
+            model=self.model,
+            base_url=self.base_url,
+            other_args=self.other_args,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        )
+
+        try:
+            args = SimpleNamespace(
+                base_url=self.base_url,
+                model=self.model,
+                eval_name="mgsm_en",
+                num_examples=None,
+                num_threads=1024,
+            )
+            metrics = run_eval(args)
+            print(f"MGSM accuracy for {self.model}: {metrics['score']}")
+
+            # Placeholder threshold - adjust after first successful run
+            expected_threshold = 0.90
+            self.assertGreaterEqual(
+                metrics["score"],
+                expected_threshold,
+                f"MGSM accuracy {metrics['score']} below threshold {expected_threshold}",
+            )
+        finally:
+            kill_process_tree(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 218e8b5ac65620f4627a2523071729f3430bbea4 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Fri, 5 Dec 2025 20:42:09 -0800
Subject: [PATCH 2/6] Fix Eagle test: use base model with Eagle as draft model
 for speculative decoding

---
 .../nightly/test_mistral_large3_eagle_perf.py | 24 +++++++++++++------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/test/nightly/test_mistral_large3_eagle_perf.py b/test/nightly/test_mistral_large3_eagle_perf.py
index 0297b4705dee..c7928702b7c7 100644
--- a/test/nightly/test_mistral_large3_eagle_perf.py
+++ b/test/nightly/test_mistral_large3_eagle_perf.py
@@ -16,6 +16,8 @@
 
 register_cuda_ci(est_time=600, suite="nightly-8-gpu-b200", nightly=True)
 
+# Base model and Eagle draft model
+MISTRAL_LARGE3_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512"
 MISTRAL_LARGE3_EAGLE_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle"
 PROFILE_DIR = "performance_profiles_mistral_large3_eagle"
 
@@ -26,21 +28,29 @@ def setUpClass(cls):
         # Set environment variable to disable JIT DeepGemm
         os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"] = "0"
 
-        cls.model = MISTRAL_LARGE3_EAGLE_MODEL_PATH
+        cls.model = MISTRAL_LARGE3_MODEL_PATH
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.batch_sizes = [1, 1, 8, 16, 64]
         cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
         cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512"))
 
-        # Mistral-Large-3-Eagle requires TP=8, trtllm_mla attention backend,
-        # speculative MoE runner backend, and auto kv cache dtype
+        # Mistral-Large-3 with Eagle speculative decoding
+        # Eagle model is used as draft model for speculative decoding
         cls.other_args = [
             "--tp",
             "8",
             "--attention-backend",
             "trtllm_mla",
-            "--speculative-moe-runner-backend",
-            "flashinfer_trtllm",
+            "--speculative-algorithm",
+            "EAGLE",
+            "--speculative-draft-model-path",
+            MISTRAL_LARGE3_EAGLE_MODEL_PATH,
+            "--speculative-num-steps",
+            "3",
+            "--speculative-eagle-topk",
+            "4",
+            "--speculative-num-draft-tokens",
+            "16",
             "--kv-cache-dtype",
             "auto",
             "--model-loader-extra-config",
@@ -76,7 +86,7 @@ def test_bench_one_batch(self):
             )
 
     def test_accuracy_mgsm(self):
-        """Run MGSM accuracy evaluation for Mistral Large 3 Eagle."""
+        """Run MGSM accuracy evaluation for Mistral Large 3 with Eagle."""
         process = popen_launch_server(
             model=self.model,
             base_url=self.base_url,
@@ -93,7 +103,7 @@ def test_accuracy_mgsm(self):
                 num_threads=1024,
             )
             metrics = run_eval(args)
-            print(f"MGSM accuracy for {self.model}: {metrics['score']}")
+            print(f"MGSM accuracy for {self.model} with Eagle: {metrics['score']}")
 
             # Placeholder threshold - adjust after first successful run
             expected_threshold = 0.90

From c1841e41cc5f04939a4035273a1d571e2f3d4ee5 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Sat, 6 Dec 2025 01:44:58 -0800
Subject: [PATCH 3/6] Fix: use speculative-eagle-topk=1 for trtllm_mla backend
 compatibility

---
 test/nightly/test_mistral_large3_eagle_perf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/nightly/test_mistral_large3_eagle_perf.py b/test/nightly/test_mistral_large3_eagle_perf.py
index c7928702b7c7..85489540da65 100644
--- a/test/nightly/test_mistral_large3_eagle_perf.py
+++ b/test/nightly/test_mistral_large3_eagle_perf.py
@@ -48,9 +48,9 @@ def setUpClass(cls):
             "--speculative-num-steps",
             "3",
             "--speculative-eagle-topk",
-            "4",
+            "1",
             "--speculative-num-draft-tokens",
-            "16",
+            "4",
             "--kv-cache-dtype",
             "auto",
             "--model-loader-extra-config",

From 356eecadd6447b38d5ddf88790d44e3acfe34cb9 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Sat, 6 Dec 2025 03:00:06 -0800
Subject: [PATCH 4/6] Fix: incorrect import path in mistral_large_3_eagle.py
 preventing model registration

---
 python/sglang/srt/models/mistral_large_3_eagle.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/models/mistral_large_3_eagle.py b/python/sglang/srt/models/mistral_large_3_eagle.py
index f136640fde78..18f401507199 100644
--- a/python/sglang/srt/models/mistral_large_3_eagle.py
+++ b/python/sglang/srt/models/mistral_large_3_eagle.py
@@ -4,7 +4,7 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from python.sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp
+from sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp
 from sglang.srt.distributed import get_pp_group
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import RowParallelLinear

From 3436a76ee76db1d1d1f9ac5db40d4427e543cebb Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Sat, 6 Dec 2025 21:53:22 -0800
Subject: [PATCH 5/6] Fix isort import order

---
 python/sglang/srt/models/mistral_large_3_eagle.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/models/mistral_large_3_eagle.py b/python/sglang/srt/models/mistral_large_3_eagle.py
index 18f401507199..08f7271fde6c 100644
--- a/python/sglang/srt/models/mistral_large_3_eagle.py
+++ b/python/sglang/srt/models/mistral_large_3_eagle.py
@@ -4,8 +4,8 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp
 from sglang.srt.distributed import get_pp_group
+from sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import RowParallelLinear
 from sglang.srt.layers.quantization.base_config import QuantizationConfig

From 95e4668846be6cd758e8a4854255f9f19da78df5 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Thu, 11 Dec 2025 04:27:49 -0800
Subject: [PATCH 6/6] Consolidate Eagle test into existing Mistral Large 3
 nightly test

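Move the Mistral Large 3 Eagle test into the same test file as the base
Mistral Large 3 test, removing the need for separate workflow steps.
The Eagle test now runs within the existing nightly test step (which also
cleans the Eagle profile directory), and the Eagle traces are published by
the existing Mistral Large 3 publish step.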
---
 .github/workflows/nightly-test-nvidia.yml     |  21 +--
 .../nightly/test_mistral_large3_eagle_perf.py | 120 ------------------
 test/nightly/test_mistral_large3_perf.py      |  99 +++++++++++++++
 3 files changed, 100 insertions(+), 140 deletions(-)
 delete mode 100644 test/nightly/test_mistral_large3_eagle_perf.py

diff --git a/.github/workflows/nightly-test-nvidia.yml b/.github/workflows/nightly-test-nvidia.yml
index 84492b8e05bd..d86006f1eea0 100644
--- a/.github/workflows/nightly-test-nvidia.yml
+++ b/.github/workflows/nightly-test-nvidia.yml
@@ -432,6 +432,7 @@ jobs:
           SGLANG_ENABLE_JIT_DEEPGEMM: "0"
         run: |
           rm -rf test/performance_profiles_mistral_large3/
+          rm -rf test/performance_profiles_mistral_large3_eagle/
           cd test
           IS_BLACKWELL=1 python3 nightly/test_mistral_large3_perf.py
 
@@ -442,26 +443,6 @@ jobs:
           GITHUB_RUN_NUMBER: ${{ github.run_number }}
         run: |
           python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_mistral_large3
-
-      - name: Run Mistral-Large-3-Eagle nightly performance test
-        if: always()
-        timeout-minutes: 180
-        env:
-          TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
-          PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
-          GPU_CONFIG: "8-gpu-b200"
-          SGLANG_ENABLE_JIT_DEEPGEMM: "0"
-        run: |
-          rm -rf test/performance_profiles_mistral_large3_eagle/
-          cd test
-          IS_BLACKWELL=1 python3 nightly/test_mistral_large3_eagle_perf.py
-
-      - name: Publish Mistral-Large-3-Eagle traces to storage repo
-        env:
-          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
-          GITHUB_RUN_ID: ${{ github.run_id }}
-          GITHUB_RUN_NUMBER: ${{ github.run_number }}
-        run: |
           python3 scripts/ci/publish_traces.py --traces-dir test/performance_profiles_mistral_large3_eagle
 
       - name: Run DeepSeek v3.1 nightly performance test
diff --git a/test/nightly/test_mistral_large3_eagle_perf.py b/test/nightly/test_mistral_large3_eagle_perf.py
deleted file mode 100644
index 85489540da65..000000000000
--- a/test/nightly/test_mistral_large3_eagle_perf.py
+++ /dev/null
@@ -1,120 +0,0 @@
-import os
-import unittest
-from types import SimpleNamespace
-
-from nightly_utils import NightlyBenchmarkRunner
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.ci.ci_register import register_cuda_ci
-from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import (
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    _parse_int_list_env,
-    popen_launch_server,
-)
-
-register_cuda_ci(est_time=600, suite="nightly-8-gpu-b200", nightly=True)
-
-# Base model and Eagle draft model
-MISTRAL_LARGE3_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512"
-MISTRAL_LARGE3_EAGLE_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle"
-PROFILE_DIR = "performance_profiles_mistral_large3_eagle"
-
-
-class TestNightlyMistralLarge3EaglePerformance(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        # Set environment variable to disable JIT DeepGemm
-        os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"] = "0"
-
-        cls.model = MISTRAL_LARGE3_MODEL_PATH
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.batch_sizes = [1, 1, 8, 16, 64]
-        cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
-        cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512"))
-
-        # Mistral-Large-3 with Eagle speculative decoding
-        # Eagle model is used as draft model for speculative decoding
-        cls.other_args = [
-            "--tp",
-            "8",
-            "--attention-backend",
-            "trtllm_mla",
-            "--speculative-algorithm",
-            "EAGLE",
-            "--speculative-draft-model-path",
-            MISTRAL_LARGE3_EAGLE_MODEL_PATH,
-            "--speculative-num-steps",
-            "3",
-            "--speculative-eagle-topk",
-            "1",
-            "--speculative-num-draft-tokens",
-            "4",
-            "--kv-cache-dtype",
-            "auto",
-            "--model-loader-extra-config",
-            '{"enable_multithread_load": true}',
-            "--chat-template",
-            "mistral",
-        ]
-
-        cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url)
-        cls.runner.setup_profile_directory()
-
-    @classmethod
-    def tearDownClass(cls):
-        # Clean up environment variable
-        if "SGLANG_ENABLE_JIT_DEEPGEMM" in os.environ:
-            del os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"]
-
-    def test_bench_one_batch(self):
-        results, success = self.runner.run_benchmark_for_model(
-            model_path=self.model,
-            batch_sizes=self.batch_sizes,
-            input_lens=self.input_lens,
-            output_lens=self.output_lens,
-            other_args=self.other_args,
-        )
-
-        self.runner.add_report(results)
-        self.runner.write_final_report()
-
-        if not success:
-            raise AssertionError(
-                f"Benchmark failed for {self.model}. Check the logs for details."
-            )
-
-    def test_accuracy_mgsm(self):
-        """Run MGSM accuracy evaluation for Mistral Large 3 with Eagle."""
-        process = popen_launch_server(
-            model=self.model,
-            base_url=self.base_url,
-            other_args=self.other_args,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-        )
-
-        try:
-            args = SimpleNamespace(
-                base_url=self.base_url,
-                model=self.model,
-                eval_name="mgsm_en",
-                num_examples=None,
-                num_threads=1024,
-            )
-            metrics = run_eval(args)
-            print(f"MGSM accuracy for {self.model} with Eagle: {metrics['score']}")
-
-            # Placeholder threshold - adjust after first successful run
-            expected_threshold = 0.90
-            self.assertGreaterEqual(
-                metrics["score"],
-                expected_threshold,
-                f"MGSM accuracy {metrics['score']} below threshold {expected_threshold}",
-            )
-        finally:
-            kill_process_tree(process.pid)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/nightly/test_mistral_large3_perf.py b/test/nightly/test_mistral_large3_perf.py
index c4272a5f3135..3e6feb52d1c9 100644
--- a/test/nightly/test_mistral_large3_perf.py
+++ b/test/nightly/test_mistral_large3_perf.py
@@ -17,6 +17,7 @@
 register_cuda_ci(est_time=600, suite="nightly-8-gpu-b200", nightly=True)
 
 MISTRAL_LARGE3_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512"
+MISTRAL_LARGE3_EAGLE_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle"
 PROFILE_DIR = "performance_profiles_mistral_large3"
 
 
@@ -101,5 +102,103 @@ def test_accuracy_mgsm(self):
             kill_process_tree(process.pid)
 
 
+class TestNightlyMistralLarge3EaglePerformance(unittest.TestCase):
+    """Test Mistral Large 3 with Eagle speculative decoding."""
+
+    @classmethod
+    def setUpClass(cls):
+        # Set environment variable to disable JIT DeepGemm
+        os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"] = "0"
+
+        cls.model = MISTRAL_LARGE3_MODEL_PATH
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.batch_sizes = [1, 1, 8, 16, 64]
+        cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
+        cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512"))
+
+        # Mistral-Large-3 with Eagle speculative decoding
+        # Eagle model is used as draft model for speculative decoding
+        cls.other_args = [
+            "--tp",
+            "8",
+            "--attention-backend",
+            "trtllm_mla",
+            "--speculative-algorithm",
+            "EAGLE",
+            "--speculative-draft-model-path",
+            MISTRAL_LARGE3_EAGLE_MODEL_PATH,
+            "--speculative-num-steps",
+            "3",
+            "--speculative-eagle-topk",
+            "1",
+            "--speculative-num-draft-tokens",
+            "4",
+            "--kv-cache-dtype",
+            "auto",
+            "--model-loader-extra-config",
+            '{"enable_multithread_load": true}',
+            "--chat-template",
+            "mistral",
+        ]
+
+        cls.runner = NightlyBenchmarkRunner(
+            "performance_profiles_mistral_large3_eagle", cls.__name__, cls.base_url
+        )
+        cls.runner.setup_profile_directory()
+
+    @classmethod
+    def tearDownClass(cls):
+        # Clean up environment variable
+        if "SGLANG_ENABLE_JIT_DEEPGEMM" in os.environ:
+            del os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"]
+
+    def test_eagle_bench_one_batch(self):
+        results, success = self.runner.run_benchmark_for_model(
+            model_path=self.model,
+            batch_sizes=self.batch_sizes,
+            input_lens=self.input_lens,
+            output_lens=self.output_lens,
+            other_args=self.other_args,
+        )
+
+        self.runner.add_report(results)
+        self.runner.write_final_report()
+
+        if not success:
+            raise AssertionError(
+                f"Benchmark failed for {self.model} with Eagle. Check the logs for details."
+            )
+
+    def test_eagle_accuracy_mgsm(self):
+        """Run MGSM accuracy evaluation for Mistral Large 3 with Eagle."""
+        process = popen_launch_server(
+            model=self.model,
+            base_url=self.base_url,
+            other_args=self.other_args,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        )
+
+        try:
+            args = SimpleNamespace(
+                base_url=self.base_url,
+                model=self.model,
+                eval_name="mgsm_en",
+                num_examples=None,
+                num_threads=1024,
+            )
+            metrics = run_eval(args)
+            print(f"MGSM accuracy for {self.model} with Eagle: {metrics['score']}")
+
+            # Placeholder threshold - adjust after first successful run
+            expected_threshold = 0.90
+            self.assertGreaterEqual(
+                metrics["score"],
+                expected_threshold,
+                f"MGSM accuracy {metrics['score']} below threshold {expected_threshold}",
+            )
+        finally:
+            kill_process_tree(process.pid)
+
+
 if __name__ == "__main__":
     unittest.main()