diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 2c0bd52c0e3e..a5d2d7f41d8a 100755 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -394,18 +394,24 @@ def run_eagle2_5(questions: list[str], modality: str) -> ModelRequestData: def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData: model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT" + mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1} engine_args = EngineArgs( model=model_name, max_model_len=4096, max_num_seqs=5, - limit_mm_per_prompt={modality: 1}, + limit_mm_per_prompt=mm_limit, trust_remote_code=True, ) + image_placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>" + video_placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>" + if modality == "image": - placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>" + placeholder = image_placeholder elif modality == "video": - placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>" + placeholder = video_placeholder + elif modality == "image+video": + placeholder = image_placeholder + video_placeholder prompts = [ ( @@ -425,6 +431,7 @@ def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData: def run_exaone4_5(questions: list[str], modality: str) -> ModelRequestData: model_name = "LGAI-EXAONE/EXAONE-4.5-33B" + mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1} engine_args = EngineArgs( model=model_name, max_model_len=4096, @@ -434,18 +441,23 @@ def run_exaone4_5(questions: list[str], modality: str) -> ModelRequestData: "max_pixels": 1280 * 28 * 28, "fps": 1, }, - limit_mm_per_prompt={modality: 1}, + limit_mm_per_prompt=mm_limit, ) + image_placeholder = "<|image_pad|>" + video_placeholder = "<|video_pad|>" + if modality == "image": - placeholder = "<|image_pad|>" + placeholder = image_placeholder elif modality == "video": - placeholder = "<|video_pad|>" + placeholder = video_placeholder + elif modality == "image+video": + placeholder = image_placeholder + video_placeholder prompts = [ ( "<|system|>\nYou are a helpful assistant.<|endofturn|>\n" - f"<|user|>\n{placeholder}" + f"<|user|>\n{placeholder}" f"{question}<|endofturn|>\n" "<|assistant|>\n" ) @@ -566,6 +578,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData: model_name = "zai-org/GLM-4.1V-9B-Thinking" + mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1} engine_args = EngineArgs( model=model_name, max_model_len=4096, @@ -574,14 +587,19 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData: "size": {"shortest_edge": 12544, "longest_edge": 47040000}, "fps": 1, }, - limit_mm_per_prompt={modality: 1}, + limit_mm_per_prompt=mm_limit, enforce_eager=True, ) + image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>" + video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>" + if modality == "image": - placeholder = "<|begin_of_image|><|image|><|end_of_image|>" + placeholder = image_placeholder elif modality == "video": - placeholder = "<|begin_of_video|><|video|><|end_of_video|>" + placeholder = video_placeholder + elif modality == "image+video": + placeholder = image_placeholder + video_placeholder prompts = [ ( @@ -602,6 +620,7 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData: def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData: model_name = "zai-org/GLM-4.5V" + mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1} engine_args = EngineArgs( model=model_name, max_model_len=4096, @@ -610,15 +629,20 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData: "size": {"shortest_edge": 12544, "longest_edge": 47040000}, "fps": 1, }, - limit_mm_per_prompt={modality: 1}, + limit_mm_per_prompt=mm_limit, enforce_eager=True, tensor_parallel_size=4, ) + image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>" + video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>" + if modality == "image": - placeholder = "<|begin_of_image|><|image|><|end_of_image|>" + placeholder = image_placeholder elif modality == "video": - placeholder = "<|begin_of_video|><|video|><|end_of_video|>" + placeholder = video_placeholder + elif modality == "image+video": + placeholder = image_placeholder + video_placeholder prompts = [ ( @@ -639,6 +663,7 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData: def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData: model_name = "zai-org/GLM-4.5V-FP8" + mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1} engine_args = EngineArgs( model=model_name, max_model_len=4096, @@ -647,15 +672,20 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData: "size": {"shortest_edge": 12544, "longest_edge": 47040000}, "fps": 1, }, - limit_mm_per_prompt={modality: 1}, + limit_mm_per_prompt=mm_limit, enforce_eager=True, tensor_parallel_size=4, ) + image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>" + video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>" + if modality == "image": - placeholder = "<|begin_of_image|><|image|><|end_of_image|>" + placeholder = image_placeholder elif modality == "video": - placeholder = "<|begin_of_video|><|video|><|end_of_video|>" + placeholder = video_placeholder + elif modality == "image+video": + placeholder = image_placeholder + video_placeholder prompts = [ ( @@ -676,6 +706,7 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData: def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData: model_name = "zai-org/GLM-OCR" + mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1} engine_args = EngineArgs( model=model_name, max_model_len=4096, @@ -684,14 +715,19 @@ def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData: "size": {"shortest_edge": 12544, "longest_edge": 47040000}, "fps": 1, }, - limit_mm_per_prompt={modality: 1}, + limit_mm_per_prompt=mm_limit, enforce_eager=True, ) + image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>" + video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>" + if modality == "image": - placeholder = "<|begin_of_image|><|image|><|end_of_image|>" + placeholder = image_placeholder elif modality == "video": - placeholder = "<|begin_of_video|><|video|><|end_of_video|>" + placeholder = video_placeholder + elif modality == "image+video": + placeholder = image_placeholder + video_placeholder prompts = [ ( @@ -772,11 +808,12 @@ def run_hyperclovax_seed_vision( model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1} engine_args = EngineArgs( model=model_name, trust_remote_code=True, - max_model_len=8192 if modality == "image" else 16384, - limit_mm_per_prompt={modality: 1}, + max_model_len=16384 if modality in ("video", "image+video") else 8192, + limit_mm_per_prompt=mm_limit, ) messages = list() @@ -828,6 +865,29 @@ def run_hyperclovax_seed_vision( } ] ) + elif modality == "image+video": + messages.append( + [ + { + "role": "user", + "content": [ + { + "type": "image", + "ocr": "", + "lens_keywords": "", + "lens_local_keywords": "", + }, + { + "type": "video", + }, + { + "type": "text", + "text": question, + }, + ], + } + ] + ) else: raise ValueError(f"Unsupported modality: {modality}") @@ -876,19 +936,25 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: def run_interns1(questions: list[str], modality: str) -> ModelRequestData: model_name = "internlm/Intern-S1-mini" + mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1} engine_args = EngineArgs( model=model_name, trust_remote_code=True, max_model_len=8192, max_num_seqs=2, - limit_mm_per_prompt={modality: 1}, + limit_mm_per_prompt=mm_limit, enforce_eager=True, ) + image_placeholder = "" + video_placeholder = "