diff --git a/python/sglang/test/simple_eval_mmmu_vlm.py b/python/sglang/test/simple_eval_mmmu_vlm.py index f647340ea4be..e05885e9d739 100644 --- a/python/sglang/test/simple_eval_mmmu_vlm.py +++ b/python/sglang/test/simple_eval_mmmu_vlm.py @@ -148,12 +148,20 @@ def _key(idx): options = None # Build final textual prompt; include choices if MC - prompt_text = f"Question: {question}\n\n" + prompt_text = f"{question}\n" if options: letters = [chr(ord("A") + i) for i in range(len(options))] for letter, opt in zip(letters, options): - prompt_text += f"{letter}) {opt}\n" - prompt_text += "\nAnswer: " + prompt_text += f"{letter}. {opt}\n" + prompt_text += ( + "\nAnswer the following multiple-choice question. " + "The last line of your response should be of the " + "following format: 'Answer: $LETTER' (without quotes) " + "where LETTER is one of the options. " + "Think step by step before answering." + ) + else: + prompt_text += "\nAnswer: " samples.append( { @@ -330,6 +338,14 @@ def _parse_multi_choice_response( response: str, all_choices: List[str], index2ans: dict ) -> str: # loosely adapted from benchmark mmmu eval + + # First, look for explicit "Answer: X" pattern (last occurrence) + answer_matches = re.findall(r"[Aa]nswer\s*:\s*\*?\*?\s*\(?([A-Z])\)?", response) + if answer_matches: + candidate = answer_matches[-1] + if candidate in all_choices: + return candidate + for char in [",", ".", "!", "?", ";", ":", "'"]: response = response.strip(char) response = " " + response + " " diff --git a/test/registered/vlm/test_vlm_tp4.py b/test/registered/vlm/test_vlm_tp4.py new file mode 100644 index 000000000000..d62df9b0b985 --- /dev/null +++ b/test/registered/vlm/test_vlm_tp4.py @@ -0,0 +1,82 @@ +""" +VLM TP=4 per-commit test using Qwen3.5-27B with MMMU evaluation. +""" + +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + popen_launch_server, +) + +register_cuda_ci(est_time=200, suite="stage-c-test-4-gpu-h100") + +QWEN35_27B_MODEL = "Qwen/Qwen3.5-27B" +MMMU_ACCURACY_THRESHOLD = 0.65 +MMMU_NUM_EXAMPLES = 32 + + +class TestVLMTP4(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = QWEN35_27B_MODEL + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--tp-size", + "4", + "--cuda-graph-max-bs", + "32", + "--mem-fraction-static", + "0.8", + "--trust-remote-code", + "--mamba-scheduler-strategy", + "extra_buffer", + "--mamba-track-interval", + "128", + "--mamba-ssm-dtype", + "bfloat16", + "--chunked-prefill-size", + "2048", + "--max-running-requests", + "128", + ], + ) + + @classmethod + def tearDownClass(cls): + if hasattr(cls, "process") and cls.process: + kill_process_tree(cls.process.pid) + + def test_mmmu_accuracy(self): + args = SimpleNamespace( + model=self.model, + eval_name="mmmu", + num_examples=MMMU_NUM_EXAMPLES, + num_threads=16, + max_tokens=2048, + chat_template_kwargs={"enable_thinking": False}, + base_url=self.base_url, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval(args) + print(f"MMMU score: {metrics['score']}") + self.assertGreaterEqual( + metrics["score"], + MMMU_ACCURACY_THRESHOLD, + f"MMMU accuracy {metrics['score']:.4f} below threshold {MMMU_ACCURACY_THRESHOLD}", + ) + + +if __name__ == "__main__": + unittest.main()