Skip to content

Commit b87cb97

Browse files
myselvess and Isotr0py authored
[Model] support new model ovis2.5 (#23084)
Signed-off-by: myselvess <[email protected]> Signed-off-by: Isotr0py <[email protected]> Co-authored-by: Isotr0py <[email protected]> Co-authored-by: Isotr0py <[email protected]>
1 parent f856c33 commit b87cb97

File tree

12 files changed

+1787
-1
lines changed

12 files changed

+1787
-1
lines changed

docs/models/supported_models.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
641641
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ |
642642
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ |
643643
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ |
644+
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | | ✅︎ |
644645
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
645646
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
646647
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |

examples/offline_inference/vision_language.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1105,6 +1105,38 @@ def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
11051105
)
11061106

11071107

1108+
# Ovis2_5
1109+
def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and chat-formatted prompts for Ovis2.5.

    Args:
        questions: User questions; one prompt is produced per question.
        modality: Either ``"image"`` or ``"video"``. It selects the
            placeholder token put in front of each question and is used as
            the key of ``limit_mm_per_prompt``.

    Returns:
        A ``ModelRequestData`` carrying the engine arguments and the prompts
        rendered through the tokenizer's chat template.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    placeholder_by_modality = {"image": "<image>", "video": "<video>"}
    placeholder = placeholder_by_modality.get(modality)
    if placeholder is None:
        # Fail early with a clear message; previously an unsupported modality
        # left ``placeholder`` unassigned and surfaced as UnboundLocalError
        # only when the messages were being built.
        raise ValueError(
            f"Unsupported modality {modality!r} for Ovis2.5; "
            f"expected one of {sorted(placeholder_by_modality)}"
        )

    model_name = "AIDC-AI/Ovis2.5-2B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # One single-turn conversation per question; passing the whole batch lets
    # the chat template render every prompt in a single call.
    messages = [
        [{"role": "user", "content": f"{placeholder}\n{question}"}]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
1138+
1139+
11081140
# PaliGemma
11091141
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
11101142
assert modality == "image"
@@ -1579,6 +1611,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
15791611
"nemotron_vl": run_nemotron_vl,
15801612
"NVLM_D": run_nvlm_d,
15811613
"ovis": run_ovis,
1614+
"ovis2_5": run_ovis2_5,
15821615
"paligemma": run_paligemma,
15831616
"paligemma2": run_paligemma2,
15841617
"phi3_v": run_phi3v,

examples/offline_inference/vision_language_multi_image.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -680,6 +680,36 @@ def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
680680
)
681681

682682

683+
# ovis2_5
684+
def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image Ovis2.5 request.

    Args:
        question: The user question appended after the image placeholders.
        image_urls: URLs of the images; each one gets its own numbered
            ``<image>`` placeholder and is fetched with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the prompt rendered
        through the tokenizer's chat template, and the fetched images.
    """
    model_name = "AIDC-AI/Ovis2.5-2B"
    image_count = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        # Allow exactly as many images per prompt as were supplied.
        limit_mm_per_prompt={"image": image_count},
    )

    # Numbered placeholders, counted from 1, one entry per image.
    numbered_slots = [
        f"Image-{index}: <image>\n" for index in range(1, image_count + 1)
    ]
    placeholders = "\n".join(numbered_slots)
    user_turn = {"role": "user", "content": f"{placeholders}\n{question}"}

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
711+
712+
683713
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
684714
model_name = "mistral-community/pixtral-12b"
685715

@@ -1155,6 +1185,7 @@ def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
11551185
"mllama": load_mllama,
11561186
"NVLM_D": load_nvlm_d,
11571187
"ovis": load_ovis,
1188+
"ovis2_5": load_ovis2_5,
11581189
"phi3_v": load_phi3v,
11591190
"phi4_mm": load_phi4mm,
11601191
"phi4_multimodal": load_phi4_multimodal,

tests/models/multimodal/generation/test_common.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import pytest
1212
from transformers import (AutoModel, AutoModelForImageTextToText,
1313
AutoModelForTextToWaveform, AutoModelForVision2Seq)
14+
from transformers.utils import is_flash_attn_2_available
1415

1516
from vllm.platforms import current_platform
1617
from vllm.utils import identity
@@ -621,6 +622,26 @@
621622
hf_model_kwargs={"llm_attn_implementation": "sdpa"},
622623
patch_hf_runner=model_utils.ovis_patch_hf_runner,
623624
),
625+
"ovis2_5": VLMTestInfo(
626+
models=["AIDC-AI/Ovis2.5-2B"],
627+
test_type=(
628+
VLMTestType.IMAGE,
629+
VLMTestType.MULTI_IMAGE,
630+
VLMTestType.VIDEO
631+
),
632+
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
633+
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
634+
video_idx_to_prompt=lambda idx: "<video>\n",
635+
max_model_len=4096,
636+
max_num_seqs=2,
637+
dtype="half",
638+
num_logprobs=10,
639+
patch_hf_runner=model_utils.ovis2_5_patch_hf_runner,
640+
marks=[pytest.mark.skipif(
641+
not is_flash_attn_2_available(),
642+
reason="HF model needs `flash_attn` installed"
643+
)],
644+
),
624645
"phi3v": VLMTestInfo(
625646
models=["microsoft/Phi-3.5-vision-instruct"],
626647
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),

tests/models/multimodal/generation/vlm_utils/model_utils.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import numpy as np
1212
import numpy.typing as npt
13+
import PIL.Image
1314
import pytest
1415
import regex as re
1516
import torch
@@ -810,6 +811,63 @@ def processor(*args, text="", images=None, **kwargs):
810811
return hf_model
811812

812813

814+
def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for Ovis2.5.

    Two attributes are replaced on the runner:

    * ``hf_model.model.get_output_embeddings`` is forwarded to the wrapped
      ``llm`` sub-module.
    * ``hf_model.processor`` becomes a function that turns text, images and
      videos into a ``BatchFeature`` by calling the model's own
      ``preprocess_inputs``.
    """

    def _llm_output_embeddings():
        # Looked up lazily so the current ``hf_model.model`` is always used.
        return hf_model.model.llm.get_output_embeddings()

    hf_model.model.get_output_embeddings = _llm_output_embeddings

    # (family, user-turn prefix, user-turn suffix) of the chat formats whose
    # wrapper is stripped from the incoming prompt text.
    user_turn_markers = (
        ("qwen2", "<|im_start|>user\n", "<|im_end|>\n"),
        ("llama", "<|start_header_id|>user<|end_header_id|>\n\n",
         "<|eot_id|>"),
        ("gemma2", "<start_of_turn>user\n", "<end_of_turn>\n"),
    )

    def processor(*args, text="", images=None, videos=None, **kwargs):
        # Normalise ``images`` to a list (a lone image is wrapped).
        if images is None:
            image_list = []
        elif isinstance(images, Image):
            image_list = [images]
        else:
            image_list = images

        # Normalise ``videos`` to a list of clips, then convert every frame
        # of every clip into a PIL image.
        if videos is None:
            clip_list = []
        else:
            if isinstance(videos, np.ndarray):
                videos = [videos]
            clip_list = [[PIL.Image.fromarray(frame) for frame in clip]
                         for clip in videos]

        # Keep only the user message body: the first matching chat format
        # wins and its prefix/suffix are cut away.
        for _family, prefix, suffix in user_turn_markers:
            if prefix in text and suffix in text:
                text = text.split(prefix)[1].split(suffix)[0]
                break

        # Content order: all images, then all videos, then the text.
        content = [{"type": "image", "image": img} for img in image_list]
        content.extend({
            "type": "video",
            "video": clip
        } for clip in clip_list)
        content.append({"type": "text", "text": text})
        messages = [{"role": "user", "content": content}]

        input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
            messages=messages, enable_thinking=True)
        features = {
            "inputs": input_ids,
            "pixel_values": pixel_values,
            "grid_thws": grid_thws,
        }
        return BatchFeature(data=features, tensor_type="pt")

    hf_model.processor = processor
    return hf_model
869+
870+
813871
def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
814872
"""Patches and returns an instance of the HfRunner for Qwen2.5-Omni."""
815873
thinker = hf_model.model.thinker

tests/models/multimodal/processing/test_common.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ def _test_processing_correctness(
162162
_ADD_SPECIAL_TOKENS_OVERRIDES = {
163163
"mllama": False,
164164
"ovis": False,
165+
"ovis2_5": False,
165166
"paligemma": False,
166167
"ultravox": False,
167168
"whisper": False,
@@ -301,6 +302,7 @@ def _test_processing_correctness_one(
301302
"AIDC-AI/Ovis1.6-Gemma2-9B",
302303
"AIDC-AI/Ovis1.6-Llama3.2-3B",
303304
"AIDC-AI/Ovis2-1B",
305+
"AIDC-AI/Ovis2.5-2B",
304306
"google/paligemma-3b-mix-224",
305307
"google/paligemma2-3b-ft-docci-448",
306308
"microsoft/Phi-3.5-vision-instruct",

tests/models/registry.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,9 @@ def check_available_online(
464464
transformers_version_reason="HF model is not compatible", # noqa: E501
465465
extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B",
466466
"1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501
467+
"Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True,
468+
max_transformers_version="4.53",
469+
transformers_version_reason="HF model is not compatible"), # noqa: E501
467470
"PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501
468471
extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501
469472
"Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",

0 commit comments

Comments
 (0)