sgl-project · Fridge003 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026
@@ -687,10 +687,6 @@ jobs:
         run: |
           source /etc/profile.d/sglang-ci.sh
           CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
-          pip install "setuptools==70.0.0"
-          git clone https://github.com/merrymercy/human-eval.git
-          cd human-eval
-          pip install -e . --no-build-isolation
 
       - name: Run test
         timeout-minutes: 30
@@ -802,10 +798,6 @@ jobs:
         timeout-minutes: 20
         run: |
           CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
-          pip install "setuptools==70.0.0"
-          git clone https://github.com/merrymercy/human-eval.git
-          cd human-eval
-          pip install -e . --no-build-isolation
 
       - name: Run test
         timeout-minutes: 30

diff --git a/scripts/ci/cuda/ci_install_dependency.sh b/scripts/ci/cuda/ci_install_dependency.sh
@@ -352,6 +352,13 @@ mark_step_done "Fix other dependencies"
 # can delete the .pth file without reliably recreating it (pip race condition).
 $PIP_CMD install "nvidia-cutlass-dsl>=4.4.1" "nvidia-cutlass-dsl-libs-base>=4.4.1" --no-deps --force-reinstall $PIP_INSTALL_SUFFIX || true
 
+
+# Install human-eval. NOTE(review): uses plain pip, not $PIP_CMD like the step above - confirm.
+pip install "setuptools==70.0.0"
+# Install by path (no `cd`, so later steps keep their working directory); reuse an existing clone on re-runs.
+[ -d human-eval ] || git clone https://github.com/merrymercy/human-eval.git
+pip install -e ./human-eval --no-build-isolation
+
 # ------------------------------------------------------------------------------
 # Prepare runner
 # ------------------------------------------------------------------------------

diff --git a/test/registered/distributed/test_data_parallelism.py b/test/registered/distributed/test_data_parallelism.py
@@ -5,7 +5,7 @@
 
 from sglang.srt.utils import kill_process_tree
 from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
-from sglang.test.kits.eval_accuracy_kit import MMLUMixin
+from sglang.test.kits.eval_accuracy_kit import GSM8KMixin
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -18,10 +18,8 @@
 register_amd_ci(est_time=73, suite="stage-b-test-2-gpu-large-amd")
 
 
-class TestDataParallelism(CustomTestCase, MMLUMixin):
-    mmlu_score_threshold = 0.65
-    mmlu_num_examples = 64
-    mmlu_num_threads = 32
+class TestDataParallelism(CustomTestCase, GSM8KMixin):
+    gsm8k_accuracy_thres = 0.7
 
     @classmethod
     def setUpClass(cls):

diff --git a/test/registered/distributed/test_dp_attention.py b/test/registered/distributed/test_dp_attention.py
@@ -9,7 +9,7 @@
 from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.kits.ebnf_constrained_kit import EBNFConstrainedMixin
-from sglang.test.kits.eval_accuracy_kit import MGSMEnMixin
+from sglang.test.kits.eval_accuracy_kit import GSM8KMixin
 from sglang.test.kits.json_constrained_kit import JSONConstrainedMixin
 from sglang.test.kits.radix_cache_server_kit import run_radix_attention_test
 from sglang.test.kits.regex_constrained_kit import RegexConstrainedMixin
@@ -30,16 +30,16 @@
 
 class TestDPAttentionDP2TP2(
     CustomTestCase,
-    MGSMEnMixin,
+    GSM8KMixin,
     JSONConstrainedMixin,
     EBNFConstrainedMixin,
     RegexConstrainedMixin,
 ):
-    mgsm_en_score_threshold = 0.8
+    gsm8k_accuracy_thres = 0.6
 
     @classmethod
     def setUpClass(cls):
-        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls._env_override = envs.SGLANG_DISABLE_CONSECUTIVE_PREFILL_OVERLAP.override(
             True

diff --git a/test/registered/eval/test_moe_eval_accuracy_large.py b/test/registered/eval/test_moe_eval_accuracy_large.py
@@ -8,7 +8,7 @@
 
 from sglang.srt.utils import kill_process_tree
 from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
-from sglang.test.kits.eval_accuracy_kit import HumanEvalMixin, MGSMEnMixin, MMLUMixin
+from sglang.test.kits.eval_accuracy_kit import GSM8KMixin
 from sglang.test.test_utils import (
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -22,10 +22,8 @@
 register_amd_ci(est_time=500, suite="stage-b-test-2-gpu-large-amd")
 
 
-class TestMoEEvalAccuracyLarge(CustomTestCase, MMLUMixin, HumanEvalMixin, MGSMEnMixin):
-    mmlu_score_threshold = 0.62
-    humaneval_score_threshold = 0.40
-    mgsm_en_score_threshold = 0.61
+class TestMoEEvalAccuracyLarge(CustomTestCase, GSM8KMixin):
+    gsm8k_accuracy_thres = 0.6
 
     @classmethod
     def setUpClass(cls):

diff --git a/test/registered/moe/test_moe_ep.py b/test/registered/moe/test_moe_ep.py
@@ -3,9 +3,9 @@
 
 from sglang.srt.utils import kill_process_tree
 from sglang.test.ci.ci_register import register_cuda_ci
-from sglang.test.run_eval import run_eval
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.test_utils import (
-    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     CustomTestCase,
@@ -18,7 +18,7 @@
 class TestEp(CustomTestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(
             cls.model,
@@ -37,23 +37,26 @@ def setUpClass(cls):
     def tearDownClass(cls):
         kill_process_tree(cls.process.pid)
 
-    def test_mgsm_en(self):
+    def test_gsm8k(self):
         args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="mgsm_en",
-            num_examples=None,
-            num_threads=1024,
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),  # port = text after the last ":" in base_url
         )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)  # print the returned metrics so they appear in the test log
 
-        metrics = run_eval(args)
-        self.assertGreaterEqual(metrics["score"], 0.8)
+        self.assertGreater(metrics["accuracy"], 0.60)  # strict ">": an accuracy of exactly 0.60 fails
 
 
 class TestEpDeepGEMM(CustomTestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(
             cls.model,
@@ -76,17 +79,20 @@ def setUpClass(cls):
     def tearDownClass(cls):
         kill_process_tree(cls.process.pid)
 
-    def test_mgsm_en(self):
+    def test_gsm8k(self):
         args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="mgsm_en",
-            num_examples=None,
-            num_threads=1024,
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),  # port = text after the last ":" in base_url
         )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)  # print the returned metrics so they appear in the test log
 
-        metrics = run_eval(args)
-        self.assertGreaterEqual(metrics["score"], 0.8)
+        self.assertGreater(metrics["accuracy"], 0.60)  # strict ">": an accuracy of exactly 0.60 fails
 
 
 if __name__ == "__main__":