9 changes: 4 additions & 5 deletions benchmark/mmmu/README.md
@@ -19,8 +19,7 @@ python benchmark/mmmu/bench_hf.py --model-path Qwen/Qwen2-VL-7B-Instruct
```

Some popular model results:

1. Qwen/Qwen2-VL-2B-Instruct: 0.241
2. Qwen/Qwen2-VL-7B-Instruct: 0.255
3. Qwen/Qwen2.5-VL-3B-Instruct: 0.245
4. Qwen/Qwen2.5-VL-7B-Instruct: 0.242
1. Qwen/Qwen2-VL-7B-Instruct(sglang): 0.48
2. Qwen/Qwen2-VL-7B-Instruct(hf): 0.482
3. OpenGVLab/InternVL2_5-38B(sglang): 0.612
4. OpenGVLab/InternVL2_5-38B(hf): 0.61
73 changes: 24 additions & 49 deletions benchmark/mmmu/bench_hf.py
@@ -1,73 +1,48 @@
"""
Bench the Hugging Face VLM on the MMMU benchmark

Usage:
python benchmark/mmmu/bench_hf.py --model-path Qwen/Qwen2-VL-7B-Instruct --dataset-path

The eval output will be logged
"""

import argparse
import random
import re

import torch
from data_utils import save_json
from eval_utils import (
EvalArgs,
eval_result,
get_sampling_params,
load_model,
prepare_samples,
process_result,
)
from qwen2vl_chat import Qwen2VLChat
from tqdm import tqdm
from transformers import AutoModelForImageTextToText, AutoProcessor, GenerationConfig


@torch.no_grad()
def eval_mmmu(args):
eval_args = EvalArgs.from_cli_args(args)

model = AutoModelForImageTextToText.from_pretrained(
args.model_path,
torch_dtype="auto",
trust_remote_code=True,
)
model = model.eval().cuda()

processor = AutoProcessor.from_pretrained(
args.model_path, torch_dtype="auto", device_map="auto"
)

model = load_model(args.model_path)
model.build_model()
samples = prepare_samples(eval_args)
out_samples = dict()

sampling_params = get_sampling_params(eval_args)
generation_config = GenerationConfig(
max_new_tokens=sampling_params["max_new_tokens"],
do_sample=False,
)

answer_dict = {}
for sample in tqdm(samples):
prompt = sample["final_input_prompt"]
image = sample["image"]
prefix = prompt.split("<")[0]
suffix = prompt.split(">")[1]
assert image is not None
contents = []
if prefix:
contents += [{"type": "text", "text": prefix}]
contents += [
{
"type": "image",
"image": sample["image_path"],
}
]
if suffix:
contents += [{"type": "text", "text": suffix}]
messages = [{"role": "user", "content": contents}]
model_inputs = processor.apply_chat_template(
messages,
tokenize=True,
return_dict=True,
add_generation_prompt=True,
return_tensors="pt",
).to(model.device)
input_len = model_inputs["input_ids"].shape[-1]
generation = model.generate(**model_inputs, generation_config=generation_config)
generation = generation[0][input_len:]
response = processor.decode(generation, skip_special_tokens=True)
print(f"response: {response}")
image = sample["image_1"]
if image is not None:
response = model.chat(sample)
else: # multiple images actually
if sample["question_type"] == "multiple-choice":
all_choices = sample["all_choices"]
response = random.choice(all_choices)
else:
response = "INVALID GENERATION FOR MULTIPLE IMAGE INPUTS"
process_result(response, sample, answer_dict, out_samples)

args.output_path = f"{args.model_path}_val_hf.json"
87 changes: 42 additions & 45 deletions benchmark/mmmu/bench_sglang.py
@@ -2,85 +2,82 @@
Bench the sglang-hosted VLM on the MMMU benchmark

Usage:
python benchmark/mmmu/bench_sglang.py --model-path Qwen/Qwen2-VL-7B-Instruct --chat-template qwen2-vl
python benchmark/mmmu/bench_sglang.py --model-path Qwen/Qwen2-VL-7B-Instruct --chat-template qwen2-vl --dataset-path

The eval output will be logged
"""

import argparse
import base64
import dataclasses
import random
from io import BytesIO

import openai
from data_utils import save_json
from eval_utils import (
EvalArgs,
eval_result,
get_sampling_params,
load_model,
prepare_samples,
process_result,
)
from tqdm import tqdm

from sglang.test.test_utils import add_common_sglang_args_and_parse
from sglang import Engine
from sglang.srt.conversation import generate_chat_conv
from sglang.srt.openai_api.protocol import ChatCompletionRequest
from sglang.srt.server_args import ServerArgs


def eval_mmmu(args):
server_args = ServerArgs.from_cli_args(args)
eval_args = EvalArgs.from_cli_args(args)

if server_args.chat_template is None:
raise ValueError("Chat template must be provided for this benchmark")
model = load_model(args.model_path)
backend = Engine(**dataclasses.asdict(server_args))
out_samples = dict()

sampling_params = get_sampling_params(eval_args)

samples = prepare_samples(eval_args)

answer_dict = {}

# had to use an openai server, since SglImage doesn't support image data
client = openai.Client(api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1")
for sample in tqdm(samples):
image = sample["image_1"]
if image is not None:
request_dict = model.build_prompt_sglang(sample)
conv = generate_chat_conv(
ChatCompletionRequest(**request_dict),
template_name=server_args.chat_template,
)
prompt = conv.get_prompt()
print(f"\033[31m{prompt}\033[0m")
gen_out = backend.generate(
prompt=prompt,
image_data=conv.image_data,
sampling_params=model.sampling_params,
)["text"]
response = gen_out
print(f"\033[32m{response}\033[0m")
else: # multiple images actually
if sample["question_type"] == "multiple-choice":
all_choices = sample["all_choices"]
response = random.choice(all_choices)
else:
response = "INVALID GENERATION FOR MULTIPLE IMAGE INPUTS"

for i, sample in enumerate(tqdm(samples)):
prompt = sample["final_input_prompt"]
prefix = prompt.split("<")[0]
suffix = prompt.split(">")[1]
image = sample["image"]
assert image is not None
image_path = sample["image_path"]
# TODO: batch
response = client.chat.completions.create(
model="default",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": prefix,
},
{
"type": "image_url",
"image_url": {"url": image_path},
},
{
"type": "text",
"text": suffix,
},
],
}
],
temperature=0,
max_completion_tokens=sampling_params["max_new_tokens"],
max_tokens=sampling_params["max_new_tokens"],
)
response = response.choices[0].message.content
process_result(response, sample, answer_dict, out_samples)

args.output_path = f"./val_sglang.json"
args.output_path = f"{args.model_path}_val_sglang.json"
save_json(args.output_path, out_samples)
eval_result(model_answer_path=args.output_path, answer_dict=answer_dict)

backend.shutdown()


if __name__ == "__main__":
parser = argparse.ArgumentParser()
args = add_common_sglang_args_and_parse(parser)
ServerArgs.add_cli_args(parser)
EvalArgs.add_cli_args(parser)
args = parser.parse_args()

2 changes: 1 addition & 1 deletion benchmark/mmmu/data_utils.py
@@ -187,7 +187,7 @@ def construct_prompt(sample, config):
index2ans = {}
for option in options:
prediction_range.append(start_chr)
example += f"({start_chr}) {option}\n"
example += f"{start_chr}. {option}\n"
index2ans[start_chr] = option
start_chr = chr(ord(start_chr) + 1)
empty_prompt_sample_structure = config["multi_choice_example_format"]
43 changes: 16 additions & 27 deletions benchmark/mmmu/eval_utils.py
@@ -19,14 +19,15 @@
process_single_sample,
)
from datasets import concatenate_datasets, load_dataset
from tqdm import tqdm
from internvl_chat import InternVLChat
from qwen2vl_chat import Qwen2VLChat


@dataclasses.dataclass
class EvalArgs:
backend: str = "engine"
seed: int = 42
split: str = "validation"
# Default setting to make the benchmark available on A100 for most 7B models
image_pixels_limit: int = 4300000
result_filename: str = ""
prompt_format_file: str = "prompt_format.yaml"
@@ -35,10 +36,10 @@ class EvalArgs:

@staticmethod
def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument("--backend", type=str, default=EvalArgs.backend)
parser.add_argument(
"--result-filename", type=str, default=EvalArgs.result_filename
)

parser.add_argument(
"--image-pixels-limit", type=int, default=EvalArgs.image_pixels_limit
)
@@ -107,7 +108,7 @@ def prepare_samples(eval_args: EvalArgs):
# run for each subject
sub_dataset_list = []

for subject in tqdm(CAT_SHORT2LONG.values()):
for subject in CAT_SHORT2LONG.values():
sub_dataset = load_dataset(
eval_args.dataset_path, subject, split=eval_args.split
)
@@ -120,31 +121,9 @@
## prepare images
samples = []
skip_count = 0

# use image file as input to ensure the consistency between sglang and hf
images_path = os.path.expanduser("~/.cache/mmmu/images")
os.makedirs(images_path, exist_ok=True)
print(f"Saving images to: {images_path}")

for i, sample in enumerate(tqdm(dataset)):
sample = process_single_sample(sample)
for i, sample in enumerate(dataset):
sample = construct_prompt(sample, eval_args.config)
image = sample["image"]

width, height = image.size
if width * height >= eval_args.image_pixels_limit:
skip_count += 1
continue
image_path = f"{images_path}/image_{i}.png"
if not os.path.exists(image_path):
image.save(image_path)
sample["image_path"] = image_path
samples.append(sample)

print(
f"skipping {skip_count} samples with large images, {round((float(skip_count) / len(dataset)) * 100, 2)}% of dataset"
)
print("samples have been prepared")
return samples


@@ -559,3 +538,13 @@ def eval_result(model_answer_path, answer_dict):
print(f"eval out saved to {out}")

print(f"Overall accuracy: {overall_acc}")


def load_model(path):
if "Qwen2-VL" in path:
model = Qwen2VLChat(path)
elif "InternVL" in path:
model = InternVLChat(path)
else:
raise Exception("This model is not supported yet.")
return model
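
For orientation, here is a minimal sketch of how this helper is meant to be driven, mirroring the flow of bench_hf.py above. It is not part of the diff: the argument parsing, the dataset-path handling inside `EvalArgs`, and the `build_model`/`chat` interface of the returned wrapper are assumed to behave as the surrounding code shows.

```python
# Minimal usage sketch (assumptions noted above; multi-image fallback simplified).
import argparse

from data_utils import save_json
from eval_utils import EvalArgs, eval_result, load_model, prepare_samples, process_result

parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="Qwen/Qwen2-VL-7B-Instruct")
EvalArgs.add_cli_args(parser)
args = parser.parse_args()
eval_args = EvalArgs.from_cli_args(args)

model = load_model(args.model_path)  # dispatches on the model name (Qwen2-VL / InternVL)
model.build_model()                  # HF driver: load the weights

answer_dict, out_samples = {}, {}
for sample in prepare_samples(eval_args):
    if sample["image_1"] is not None:   # single-image question
        response = model.chat(sample)
    else:                               # multi-image questions are not generated
        response = "INVALID GENERATION FOR MULTIPLE IMAGE INPUTS"
    process_result(response, sample, answer_dict, out_samples)

output_path = f"{args.model_path}_val_hf.json"
save_json(output_path, out_samples)
eval_result(model_answer_path=output_path, answer_dict=answer_dict)
```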