diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 2c0bd52c0e3e..a5d2d7f41d8a 100755
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -394,18 +394,24 @@ def run_eagle2_5(questions: list[str], modality: str) -> ModelRequestData:
def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT"
+ mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=5,
- limit_mm_per_prompt={modality: 1},
+ limit_mm_per_prompt=mm_limit,
trust_remote_code=True,
)
+ image_placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
+ video_placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"
+
if modality == "image":
- placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
+ placeholder = image_placeholder
elif modality == "video":
- placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"
+ placeholder = video_placeholder
+ elif modality == "image+video":
+ placeholder = image_placeholder + video_placeholder
prompts = [
(
@@ -425,6 +431,7 @@ def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
def run_exaone4_5(questions: list[str], modality: str) -> ModelRequestData:
model_name = "LGAI-EXAONE/EXAONE-4.5-33B"
+ mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -434,18 +441,23 @@ def run_exaone4_5(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
- limit_mm_per_prompt={modality: 1},
+ limit_mm_per_prompt=mm_limit,
)
+ image_placeholder = "<|image_pad|>"
+ video_placeholder = "<|video_pad|>"
+
if modality == "image":
- placeholder = "<|image_pad|>"
+ placeholder = image_placeholder
elif modality == "video":
- placeholder = "<|video_pad|>"
+ placeholder = video_placeholder
+ elif modality == "image+video":
+ placeholder = image_placeholder + video_placeholder
prompts = [
(
"<|system|>\nYou are a helpful assistant.<|endofturn|>\n"
- f"<|user|>\n{placeholder}"
+ f"<|user|>\n{placeholder}"
f"{question}<|endofturn|>\n"
"<|assistant|>\n"
)
@@ -566,6 +578,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-4.1V-9B-Thinking"
+ mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -574,14 +587,19 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
"size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1,
},
- limit_mm_per_prompt={modality: 1},
+ limit_mm_per_prompt=mm_limit,
enforce_eager=True,
)
+ image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+ video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+
if modality == "image":
- placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+ placeholder = image_placeholder
elif modality == "video":
- placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+ placeholder = video_placeholder
+ elif modality == "image+video":
+ placeholder = image_placeholder + video_placeholder
prompts = [
(
@@ -602,6 +620,7 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-4.5V"
+ mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -610,15 +629,20 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
"size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1,
},
- limit_mm_per_prompt={modality: 1},
+ limit_mm_per_prompt=mm_limit,
enforce_eager=True,
tensor_parallel_size=4,
)
+ image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+ video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+
if modality == "image":
- placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+ placeholder = image_placeholder
elif modality == "video":
- placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+ placeholder = video_placeholder
+ elif modality == "image+video":
+ placeholder = image_placeholder + video_placeholder
prompts = [
(
@@ -639,6 +663,7 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-4.5V-FP8"
+ mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -647,15 +672,20 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
"size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1,
},
- limit_mm_per_prompt={modality: 1},
+ limit_mm_per_prompt=mm_limit,
enforce_eager=True,
tensor_parallel_size=4,
)
+ image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+ video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+
if modality == "image":
- placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+ placeholder = image_placeholder
elif modality == "video":
- placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+ placeholder = video_placeholder
+ elif modality == "image+video":
+ placeholder = image_placeholder + video_placeholder
prompts = [
(
@@ -676,6 +706,7 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-OCR"
+ mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -684,14 +715,19 @@ def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData:
"size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1,
},
- limit_mm_per_prompt={modality: 1},
+ limit_mm_per_prompt=mm_limit,
enforce_eager=True,
)
+ image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+ video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+
if modality == "image":
- placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+ placeholder = image_placeholder
elif modality == "video":
- placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+ placeholder = video_placeholder
+ elif modality == "image+video":
+ placeholder = image_placeholder + video_placeholder
prompts = [
(
@@ -772,11 +808,12 @@ def run_hyperclovax_seed_vision(
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
- max_model_len=8192 if modality == "image" else 16384,
- limit_mm_per_prompt={modality: 1},
+ max_model_len=16384 if modality in ("video", "image+video") else 8192,
+ limit_mm_per_prompt=mm_limit,
)
messages = list()
@@ -828,6 +865,29 @@ def run_hyperclovax_seed_vision(
}
]
)
+ elif modality == "image+video":
+ messages.append(
+ [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "ocr": "",
+ "lens_keywords": "",
+ "lens_local_keywords": "",
+ },
+ {
+ "type": "video",
+ },
+ {
+ "type": "text",
+ "text": question,
+ },
+ ],
+ }
+ ]
+ )
else:
raise ValueError(f"Unsupported modality: {modality}")
@@ -876,19 +936,25 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
model_name = "internlm/Intern-S1-mini"
+ mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=2,
- limit_mm_per_prompt={modality: 1},
+ limit_mm_per_prompt=mm_limit,
enforce_eager=True,
)
+ image_placeholder = ""
+ video_placeholder = "