9 changes: 4 additions & 5 deletions benchmark/mmmu/README.md
@@ -19,8 +19,7 @@ python benchmark/mmmu/bench_hf.py --model-path Qwen/Qwen2-VL-7B-Instruct
```

Some popular model results:

1. Qwen/Qwen2-VL-2B-Instruct: 0.241
2. Qwen/Qwen2-VL-7B-Instruct: 0.255
3. Qwen/Qwen2.5-VL-3B-Instruct: 0.245
4. Qwen/Qwen2.5-VL-7B-Instruct: 0.242
1. Qwen/Qwen2-VL-7B-Instruct(sglang): 0.48
2. Qwen/Qwen2-VL-7B-Instruct(hf): 0.482
3. OpenGVLab/InternVL2_5-38B(sglang): 0.612
4. OpenGVLab/InternVL2_5-38B(hf): 0.61
73 changes: 24 additions & 49 deletions benchmark/mmmu/bench_hf.py
@@ -1,73 +1,48 @@
"""
Bench the Hugging Face VLM on the MMMU benchmark

Usage:
python benchmark/mmmu/bench_hf.py --model-path Qwen/Qwen2-VL-7B-Instruct --dataset-path

The eval output will be logged
"""

import argparse
import random
import re

import torch
from data_utils import save_json
from eval_utils import (
EvalArgs,
eval_result,
get_sampling_params,
load_model,
prepare_samples,
process_result,
)
from qwen2vl_chat import Qwen2VLChat
from tqdm import tqdm
from transformers import AutoModelForImageTextToText, AutoProcessor, GenerationConfig


@torch.no_grad()
def eval_mmmu(args):
eval_args = EvalArgs.from_cli_args(args)

model = AutoModelForImageTextToText.from_pretrained(
args.model_path,
torch_dtype="auto",
trust_remote_code=True,
)
model = model.eval().cuda()

processor = AutoProcessor.from_pretrained(
args.model_path, torch_dtype="auto", device_map="auto"
)

model = load_model(args.model_path)
model.build_model()
samples = prepare_samples(eval_args)
out_samples = dict()

sampling_params = get_sampling_params(eval_args)
generation_config = GenerationConfig(
max_new_tokens=sampling_params["max_new_tokens"],
do_sample=False,
)

answer_dict = {}
for sample in tqdm(samples):
prompt = sample["final_input_prompt"]
image = sample["image"]
prefix = prompt.split("<")[0]
suffix = prompt.split(">")[1]
assert image is not None
contents = []
if prefix:
contents += [{"type": "text", "text": prefix}]
contents += [
{
"type": "image",
"image": sample["image_path"],
}
]
if suffix:
contents += [{"type": "text", "text": suffix}]
messages = [{"role": "user", "content": contents}]
model_inputs = processor.apply_chat_template(
messages,
tokenize=True,
return_dict=True,
add_generation_prompt=True,
return_tensors="pt",
).to(model.device)
input_len = model_inputs["input_ids"].shape[-1]
generation = model.generate(**model_inputs, generation_config=generation_config)
generation = generation[0][input_len:]
response = processor.decode(generation, skip_special_tokens=True)
print(f"response: {response}")
image = sample["image_1"]
if image is not None:
response = model.chat(sample)
else: # multiple images actually
if sample["question_type"] == "multiple-choice":
all_choices = sample["all_choices"]
response = random.choice(all_choices)
else:
response = "INVALID GENERATION FOR MULTIPLE IMAGE INPUTS"
process_result(response, sample, answer_dict, out_samples)

args.output_path = f"{args.model_path}_val_hf.json"
87 changes: 42 additions & 45 deletions benchmark/mmmu/bench_sglang.py
@@ -2,85 +2,82 @@
Bench the sglang-hosted VLM on the MMMU benchmark

Usage:
python benchmark/mmmu/bench_sglang.py --model-path Qwen/Qwen2-VL-7B-Instruct --chat-template qwen2-vl
python benchmark/mmmu/bench_sglang.py --model-path Qwen/Qwen2-VL-7B-Instruct --chat-template qwen2-vl --dataset-path

The eval output will be logged
"""

import argparse
import base64
import dataclasses
import random
from io import BytesIO

import openai
from data_utils import save_json
from eval_utils import (
EvalArgs,
eval_result,
get_sampling_params,
load_model,
prepare_samples,
process_result,
)
from tqdm import tqdm

from sglang.test.test_utils import add_common_sglang_args_and_parse
from sglang import Engine
from sglang.srt.conversation import generate_chat_conv
from sglang.srt.openai_api.protocol import ChatCompletionRequest
from sglang.srt.server_args import ServerArgs


def eval_mmmu(args):
server_args = ServerArgs.from_cli_args(args)
eval_args = EvalArgs.from_cli_args(args)

if server_args.chat_template is None:
raise ValueError("Chat template must be provided for this benchmark")
model = load_model(args.model_path)
backend = Engine(**dataclasses.asdict(server_args))
out_samples = dict()

sampling_params = get_sampling_params(eval_args)

samples = prepare_samples(eval_args)

answer_dict = {}

# had to use an openai server, since SglImage doesn't support image data
client = openai.Client(api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1")
for sample in tqdm(samples):
image = sample["image_1"]
if image is not None:
request_dict = model.build_prompt_sglang(sample)
conv = generate_chat_conv(
ChatCompletionRequest(**request_dict),
template_name=server_args.chat_template,
)
prompt = conv.get_prompt()
print(f"\033[31m{prompt}\033[0m")
gen_out = backend.generate(
prompt=prompt,
image_data=conv.image_data,
sampling_params=model.sampling_params,
)["text"]
response = gen_out
print(f"\033[32m{response}\033[0m")
else: # multiple images actually
if sample["question_type"] == "multiple-choice":
all_choices = sample["all_choices"]
response = random.choice(all_choices)
else:
response = "INVALID GENERATION FOR MULTIPLE IMAGE INPUTS"

for i, sample in enumerate(tqdm(samples)):
prompt = sample["final_input_prompt"]
prefix = prompt.split("<")[0]
suffix = prompt.split(">")[1]
image = sample["image"]
assert image is not None
image_path = sample["image_path"]
# TODO: batch
response = client.chat.completions.create(
model="default",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": prefix,
},
{
"type": "image_url",
"image_url": {"url": image_path},
},
{
"type": "text",
"text": suffix,
},
],
}
],
temperature=0,
max_completion_tokens=sampling_params["max_new_tokens"],
max_tokens=sampling_params["max_new_tokens"],
)
response = response.choices[0].message.content
process_result(response, sample, answer_dict, out_samples)

args.output_path = f"./val_sglang.json"
args.output_path = f"{args.model_path}_val_sglang.json"
save_json(args.output_path, out_samples)
eval_result(model_answer_path=args.output_path, answer_dict=answer_dict)

backend.shutdown()


if __name__ == "__main__":
parser = argparse.ArgumentParser()
args = add_common_sglang_args_and_parse(parser)
ServerArgs.add_cli_args(parser)
EvalArgs.add_cli_args(parser)
args = parser.parse_args()

2 changes: 1 addition & 1 deletion benchmark/mmmu/data_utils.py
@@ -187,7 +187,7 @@ def construct_prompt(sample, config):
index2ans = {}
for option in options:
prediction_range.append(start_chr)
example += f"({start_chr}) {option}\n"
example += f"{start_chr}. {option}\n"
index2ans[start_chr] = option
start_chr = chr(ord(start_chr) + 1)
empty_prompt_sample_structure = config["multi_choice_example_format"]
43 changes: 16 additions & 27 deletions benchmark/mmmu/eval_utils.py
@@ -19,14 +19,15 @@
process_single_sample,
)
from datasets import concatenate_datasets, load_dataset
from tqdm import tqdm
from internvl_chat import InternVLChat
from qwen2vl_chat import Qwen2VLChat


@dataclasses.dataclass
class EvalArgs:
backend: str = "engine"
seed: int = 42
split: str = "validation"
# Default setting to make the benchmark available on A100 for most 7B models
image_pixels_limit: int = 4300000
result_filename: str = ""
prompt_format_file: str = "prompt_format.yaml"
@@ -35,10 +36,10 @@ class EvalArgs:

@staticmethod
def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument("--backend", type=str, default=EvalArgs.backend)
parser.add_argument(
"--result-filename", type=str, default=EvalArgs.result_filename
)

parser.add_argument(
"--image-pixels-limit", type=int, default=EvalArgs.image_pixels_limit
)
@@ -107,7 +108,7 @@ def prepare_samples(eval_args: EvalArgs):
# run for each subject
sub_dataset_list = []

for subject in tqdm(CAT_SHORT2LONG.values()):
for subject in CAT_SHORT2LONG.values():
sub_dataset = load_dataset(
eval_args.dataset_path, subject, split=eval_args.split
)
@@ -120,31 +121,9 @@
## prepare images
samples = []
skip_count = 0

# use image file as input to ensure the consistency between sglang and hf
images_path = os.path.expanduser("~/.cache/mmmu/images")
os.makedirs(images_path, exist_ok=True)
print(f"Saving images to: {images_path}")

for i, sample in enumerate(tqdm(dataset)):
sample = process_single_sample(sample)
for i, sample in enumerate(dataset):
sample = construct_prompt(sample, eval_args.config)
image = sample["image"]

width, height = image.size
if width * height >= eval_args.image_pixels_limit:
skip_count += 1
continue
image_path = f"{images_path}/image_{i}.png"
if not os.path.exists(image_path):
image.save(image_path)
sample["image_path"] = image_path
samples.append(sample)

print(
f"skipping {skip_count} samples with large images, {round((float(skip_count) / len(dataset)) * 100, 2)}% of dataset"
)
print("samples have been prepared")
return samples


@@ -559,3 +538,13 @@ def eval_result(model_answer_path, answer_dict):
print(f"eval out saved to {out}")

print(f"Overall accuracy: {overall_acc}")


def load_model(path):
if "Qwen2-VL" in path:
model = Qwen2VLChat(path)
elif "InternVL" in path:
model = InternVLChat(path)
else:
raise Exception("This model is not supported yet.")
return model
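
For orientation, here is a minimal sketch of how this helper is meant to be driven, mirroring the flow of bench_hf.py above. It is not part of the diff: the argument parsing, the dataset-path handling inside `EvalArgs`, and the `build_model`/`chat` interface of the returned wrapper are assumed to behave as the surrounding code shows.

```python
# Minimal usage sketch (assumptions noted above; multi-image fallback simplified).
import argparse

from data_utils import save_json
from eval_utils import EvalArgs, eval_result, load_model, prepare_samples, process_result

parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="Qwen/Qwen2-VL-7B-Instruct")
EvalArgs.add_cli_args(parser)
args = parser.parse_args()
eval_args = EvalArgs.from_cli_args(args)

model = load_model(args.model_path)  # dispatches on the model name (Qwen2-VL / InternVL)
model.build_model()                  # HF driver: load the weights

answer_dict, out_samples = {}, {}
for sample in prepare_samples(eval_args):
    if sample["image_1"] is not None:   # single-image question
        response = model.chat(sample)
    else:                               # multi-image questions are not generated
        response = "INVALID GENERATION FOR MULTIPLE IMAGE INPUTS"
    process_result(response, sample, answer_dict, out_samples)

output_path = f"{args.model_path}_val_hf.json"
save_json(output_path, out_samples)
eval_result(model_answer_path=output_path, answer_dict=answer_dict)
```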