diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index b8e31e274de4..fdc851493a62 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -489,13 +489,14 @@ def __init__(self, hf_runner: HfRunner):
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Image | list[Image], **kwargs):
- from vllm.model_executor.models.h2ovl import (
- IMG_CONTEXT,
- IMG_END,
- IMG_START,
+ from vllm.transformers_utils.processors.h2ovl import (
image_to_pixel_values_h2ovl,
)
+ IMG_START = "<img>"
+ IMG_END = "</img>"
+ IMG_CONTEXT = "<IMG_CONTEXT>"
+
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values_h2ovl(
@@ -751,16 +752,17 @@ def __init__(self, hf_runner: HfRunner):
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Image | list[Image], **kwargs):
- from vllm.model_executor.models.skyworkr1v import (
- IMG_CONTEXT,
- IMG_END,
- IMG_START,
- image_to_pixel_values_skyworkr1v,
+ from vllm.transformers_utils.processors.internvl import (
+ image_to_pixel_values_internvl,
)
+ IMG_START = "<img>"
+ IMG_END = "</img>"
+ IMG_CONTEXT = "<IMG_CONTEXT>"
+
images = [images] if isinstance(images, Image) else images
pixel_values = [
- image_to_pixel_values_skyworkr1v(
+ image_to_pixel_values_internvl(
image,
input_size=self.image_size,
min_num=self.min_num,
@@ -815,14 +817,15 @@ def __call__(
videos: npt.NDArray | list[npt.NDArray] = None,
**kwargs,
):
- from vllm.model_executor.models.internvl import (
- IMG_CONTEXT,
- IMG_END,
- IMG_START,
+ from vllm.transformers_utils.processors.internvl import (
image_to_pixel_values_internvl,
video_to_pixel_values_internvl,
)
+ IMG_START = "<img>"
+ IMG_END = "</img>"
+ IMG_CONTEXT = "<IMG_CONTEXT>"
+
images = [images] if isinstance(images, Image) else images
videos = [videos] if isinstance(videos, np.ndarray) else videos
if images is not None:
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 7f806064f6f8..7f5c27c8778b 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -774,7 +774,8 @@ def check_available_online(
"rednote-hilab/dots.ocr", trust_remote_code=True
),
"Eagle2_5_VLForConditionalGeneration": _HfExamplesInfo(
- "nvidia/Eagle2.5-8B", trust_remote_code=True, is_available_online=False
+ "nvidia/Eagle2.5-8B",
+ trust_remote_code=True,
),
"Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
"Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo(
diff --git a/vllm/model_executor/models/eagle2_5_vl.py b/vllm/model_executor/models/eagle2_5_vl.py
index 3e6182db586c..30b8173f19cf 100644
--- a/vllm/model_executor/models/eagle2_5_vl.py
+++ b/vllm/model_executor/models/eagle2_5_vl.py
@@ -16,7 +16,10 @@
from vllm.model_executor.models.siglip import SiglipVisionModel
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor
+from vllm.transformers_utils.processors.internvl import (
+ InternVLImageProcessor,
+ InternVLProcessor,
+)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (
@@ -68,12 +71,35 @@ class Eagle2_5_VLImageEmbeddingInputs(TensorSchema):
class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for Eagle2.5-VL model."""
- def get_hf_processor(self, **kwargs) -> Eagle2_5_VLProcessor:
- return self.ctx.init_processor(
- Eagle2_5_VLProcessor,
- config=self.ctx.get_hf_config(),
+ def get_image_processor(self, **kwargs):
+ config = self.get_hf_config()
+ vision_config = config.vision_config
+
+ kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+ kwargs.setdefault(
+ "image_size", config.force_image_size or vision_config.image_size
+ )
+ kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
+ kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
+ kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
+ kwargs.setdefault("use_thumbnail", config.use_thumbnail)
+
+ return InternVLImageProcessor(**kwargs)
+
+ def get_hf_processor(self, **kwargs) -> InternVLProcessor:
+ config = self.get_hf_config()
+ vision_config = config.vision_config
+
+ image_processor = self.get_image_processor(**kwargs)
+ image_size = image_processor.image_size
+ patch_size = vision_config.patch_size
+ downsample_ratio = config.downsample_ratio
+ image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
+
+ return InternVLProcessor(
tokenizer=self.get_tokenizer(),
- **kwargs,
+ image_processor=image_processor,
+ image_seq_length=image_seq_length,
)
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 4434d10369e9..83af8ea86cd9 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -395,13 +395,13 @@ def get_image_processor(self, **kwargs):
vision_config = config.vision_config
image_size = vision_config["image_size"]
+ kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("size", {"width": image_size, "height": image_size})
return GLM4VImageProcessorFast(**kwargs)
def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor:
- return self.ctx.init_processor(
- GLM4VProcessor,
+ return GLM4VProcessor(
tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(**kwargs),
)
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index 3b01985c4458..bc9b2cc74e64 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -28,7 +28,7 @@
PromptUpdate,
TimingContext,
)
-from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor
+from vllm.transformers_utils.processors.h2ovl import H2OVLImageProcessor, H2OVLProcessor
from .intern_vit import InternVisionModel
from .internvl import (
@@ -40,12 +40,34 @@
class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
+ def get_image_processor(self, **kwargs):
+ config = self.get_hf_config()
+ vision_config = config.vision_config
+
+ kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+ kwargs.setdefault("image_size", vision_config.image_size)
+ kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
+ kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
+ kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
+ kwargs.setdefault("use_thumbnail", config.use_thumbnail)
+ kwargs.setdefault("use_msac", config.use_msac)
+
+ return H2OVLImageProcessor(**kwargs)
+
def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
- return self.ctx.init_processor(
- H2OVLProcessor,
- config=self.get_hf_config(),
+ config = self.get_hf_config()
+ vision_config = config.vision_config
+
+ image_processor = self.get_image_processor(**kwargs)
+ image_size = image_processor.image_size
+ patch_size = vision_config.patch_size
+ downsample_ratio = config.downsample_ratio
+ image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
+
+ return H2OVLProcessor(
tokenizer=self.get_tokenizer(),
- **kwargs,
+ image_processor=image_processor,
+ image_seq_length=image_seq_length,
)
def get_num_image_tokens(
@@ -106,7 +128,7 @@ def get_replacement_internvl(item_idx: int):
if num_patches is not None:
assert isinstance(num_patches, int)
- return hf_processor.get_image_repl(feature_size, num_patches)
+ return hf_processor.get_image_repl(num_patches, num_features=feature_size)
return [
PromptReplacement(
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 8126391b269e..3c33da212f1d 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -9,6 +9,7 @@
# --------------------------------------------------------
from abc import abstractmethod
from collections.abc import Iterable, Mapping, Sequence
+from functools import cached_property
from typing import Annotated, Literal, TypeAlias, TypeVar
import torch
@@ -45,8 +46,9 @@
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processors.internvl import (
- BaseInternVLProcessor,
+ InternVLImageProcessor,
InternVLProcessor,
+ InternVLVideoProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
@@ -123,7 +125,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
"""Basic image-only ProcessingInfo for InternVL-style models."""
@abstractmethod
- def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor:
+ def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
raise NotImplementedError
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
@@ -134,7 +136,7 @@ def get_num_image_tokens(
*,
image_width: int,
image_height: int,
- processor: BaseInternVLProcessor,
+ processor: InternVLProcessor,
) -> int:
return processor.get_num_image_tokens(
image_width=image_width,
@@ -143,8 +145,9 @@ def get_num_image_tokens(
def get_image_size_with_most_features(self) -> ImageSize:
processor = self.get_hf_processor()
+ image_processor = processor.image_processor
- base_size = processor.image_size
+ base_size = image_processor.image_size
target_ratios = processor.resolve_target_ratios()
largest_feature_size, largest_feature_pinpoint = 0, None
@@ -226,7 +229,7 @@ def _call_hf_processor(
)
hf_processor = self.info.get_hf_processor(**mm_kwargs)
- image_token_id = hf_processor.image_token_id
+ image_token_id = hf_processor.ctx_image_token_id
# Since there may be extra tokens in the feature placeholders,
# we need to pass the image token ID to the model to select the
@@ -291,7 +294,7 @@ def get_replacement_internvl(item_idx: int):
if num_patches is not None:
assert isinstance(num_patches, int)
- return hf_processor.get_image_repl(feature_size, num_patches)
+ return hf_processor.get_image_repl(num_patches, num_features=feature_size)
return [
PromptReplacement(
@@ -305,23 +308,73 @@ def get_replacement_internvl(item_idx: int):
class InternVLProcessingInfo(BaseInternVLProcessingInfo):
"""InternVL ProcessingInfo extended for video processing"""
- @property
- def supports_video(self):
- return self.get_hf_processor().supports_video
+ def get_image_processor(self, **kwargs):
+ config = self.get_hf_config()
+ vision_config = config.vision_config
- def get_supported_mm_limits(self):
- video_limit = {"video": None} if self.supports_video else {}
- return {**super().get_supported_mm_limits(), **video_limit}
+ kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+ kwargs.setdefault("image_size", vision_config.image_size)
+ kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
+ kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
+ kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
+ kwargs.setdefault("use_thumbnail", config.use_thumbnail)
+
+ return InternVLImageProcessor(**kwargs)
+
+ def get_video_processor(self, **kwargs):
+ config = self.get_hf_config()
+ vision_config = config.vision_config
- def get_video_token(self) -> str | None:
+ kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+ kwargs.setdefault("image_size", vision_config.image_size)
+
+ return InternVLVideoProcessor(**kwargs)
+
+ @cached_property
+ def ctx_video_token(self):
text_model_type = self.get_hf_config().get_text_config().model_type
- video_token_map = {
+ ctx_video_token_map = {
"qwen2": "<|video_pad|>",
"qwen3": "<|video_pad|>",
"qwen3_moe": "<|video_pad|>",
"gpt_oss": "<|reserved_200000|>",
}
- return video_token_map.get(text_model_type)
+
+ if text_model_type not in ctx_video_token_map:
+ return None
+
+ ctx_video_token = ctx_video_token_map[text_model_type]
+ if ctx_video_token not in self.get_tokenizer().get_vocab():
+ return None
+
+ return ctx_video_token
+
+ def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
+ config = self.get_hf_config()
+ vision_config = config.vision_config
+
+ image_processor = self.get_image_processor(**kwargs)
+ image_size = image_processor.image_size
+ patch_size = vision_config.patch_size
+ downsample_ratio = config.downsample_ratio
+ image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
+
+ ctx_video_token = self.ctx_video_token
+ video_processor = (
+ self.get_video_processor(**kwargs) if ctx_video_token else None
+ )
+
+ return InternVLProcessor(
+ tokenizer=self.get_tokenizer(),
+ image_processor=image_processor,
+ video_processor=video_processor,
+ image_seq_length=image_seq_length,
+ ctx_video_token=ctx_video_token,
+ )
+
+ def get_supported_mm_limits(self):
+ video_limit = {"video": None} if self.ctx_video_token else {}
+ return {**super().get_supported_mm_limits(), **video_limit}
def get_num_frames_with_most_features(
self,
@@ -332,22 +385,14 @@ def get_num_frames_with_most_features(
max_videos = mm_counts.get("video", 0)
processor = self.get_hf_processor()
+ num_image_token = processor.image_seq_length
max_image_tokens = self.get_max_image_tokens() * max_images
- max_total_frames = (seq_len - max_image_tokens) // processor.num_image_token
+ max_total_frames = (seq_len - max_image_tokens) // num_image_token
max_frames_per_video = max_total_frames // max(max_videos, 1)
return max(max_frames_per_video, 1)
- def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
- return self.ctx.init_processor(
- InternVLProcessor,
- config=self.get_hf_config(),
- tokenizer=self.get_tokenizer(),
- video_token=self.get_video_token(),
- **kwargs,
- )
-
class InternVLDummyInputsBuilder(
BaseInternVLDummyInputsBuilder[InternVLProcessingInfo]
@@ -366,7 +411,7 @@ def get_dummy_mm_data(
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options)
- if self.info.supports_video:
+ if self.info.ctx_video_token:
config = self.info.get_hf_config()
image_size: int = config.vision_config.image_size
target_num_frames = self.info.get_num_frames_with_most_features(
@@ -405,11 +450,9 @@ def _call_hf_processor(
)
hf_processor = self.info.get_hf_processor(**mm_kwargs)
- if (
- self.info.supports_video
- and (video_token_id := hf_processor.video_token_id) is not None
- ):
+ if (video_token_id := hf_processor.ctx_video_token_id) is not None:
processed_outputs["video_token_id"] = torch.tensor(video_token_id)
+
return processed_outputs
def _get_mm_fields_config(
@@ -418,7 +461,7 @@ def _get_mm_fields_config(
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
image_fields = super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs)
- if self.info.supports_video:
+ if self.info.ctx_video_token:
video_num_patches = hf_inputs.get("video_num_patches", torch.empty(0))
num_videos = len(video_num_patches)
video_fields = dict(
@@ -444,6 +487,8 @@ def _get_prompt_updates(
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
out_mm_kwargs=out_mm_kwargs,
)
+ if self.info.ctx_video_token is None:
+ return prompt_repl
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
@@ -456,26 +501,20 @@ def _get_prompt_updates(
video_num_patches = []
def get_video_replacement_internvl(item_idx: int):
- feature_size = hf_processor.num_image_token
num_patches = video_num_patches[item_idx]
if num_patches is not None:
assert isinstance(num_patches, int)
- return hf_processor.get_video_repl(
- feature_size, num_patches, video_context_token=hf_processor.video_token
- )
-
- if self.info.supports_video:
- prompt_repl = [
- *prompt_repl,
- PromptReplacement(
- modality="video",
- target="