From a1ae0982bb81a06e5395894068178f5d30a6bb8a Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 15 Jun 2025 15:41:30 +0800 Subject: [PATCH 1/3] add missing llava multi images example Signed-off-by: Isotr0py <2037008807@qq.com> --- .../vision_language_multi_image.py | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index ea7a793d026b..5852e1c13436 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -289,6 +289,105 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_llava(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "llava-hf/llava-1.5-7b-hf" + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + +def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "llava-hf/llava-v1.6-mistral-7b-hf" + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = 
AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + +def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf" + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" @@ -737,6 +836,9 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData: "idefics3": load_idefics3, "internvl_chat": load_internvl, "kimi_vl": load_kimi_vl, + "llava": load_llava, + "llava-next": load_llava_next, + "llava-onevision": load_llava_onevision, "llama4": load_llama4, "mistral3": load_mistral3, "mllama": load_mllama, From 23d87a734c68fbb5cf4ba7ca4f3cb01c5c490a47 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 16 Jun 2025 17:12:36 +0800 Subject: [PATCH 2/3] fix Signed-off-by: Isotr0py <2037008807@qq.com> --- examples/offline_inference/vision_language_multi_image.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/offline_inference/vision_language_multi_image.py 
b/examples/offline_inference/vision_language_multi_image.py index 5852e1c13436..e06c128e2d28 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -293,7 +293,6 @@ def load_llava(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "llava-hf/llava-1.5-7b-hf" engine_args = EngineArgs( model=model_name, - max_model_len=8192, max_num_seqs=16, limit_mm_per_prompt={"image": len(image_urls)}, ) @@ -359,7 +358,7 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf" engine_args = EngineArgs( model=model_name, - max_model_len=8192, + max_model_len=16384, max_num_seqs=16, limit_mm_per_prompt={"image": len(image_urls)}, ) From 7da8ee55044c9d6d8bad5e8caca12b001866c3b0 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 17 Jun 2025 00:38:03 +0800 Subject: [PATCH 3/3] add caution Signed-off-by: Isotr0py <2037008807@qq.com> --- examples/offline_inference/vision_language_multi_image.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index e06c128e2d28..e55181e4f490 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -290,6 +290,8 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: def load_llava(question: str, image_urls: list[str]) -> ModelRequestData: + # NOTE: CAUTION! Original LLaVA models weren't really trained on multi-image inputs, + # so they will generate poor responses for multi-image inputs! model_name = "llava-hf/llava-1.5-7b-hf" engine_args = EngineArgs( model=model_name,