diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index b8e31e274de4..fdc851493a62 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -489,13 +489,14 @@ def __init__(self, hf_runner: HfRunner): self.image_size = self.vision_config.image_size def __call__(self, text: str, images: Image | list[Image], **kwargs): - from vllm.model_executor.models.h2ovl import ( - IMG_CONTEXT, - IMG_END, - IMG_START, + from vllm.transformers_utils.processors.h2ovl import ( image_to_pixel_values_h2ovl, ) + IMG_START = "" + IMG_END = "" + IMG_CONTEXT = "" + images = [images] if isinstance(images, Image) else images pixel_values = [ image_to_pixel_values_h2ovl( @@ -751,16 +752,17 @@ def __init__(self, hf_runner: HfRunner): self.image_size = self.vision_config.image_size def __call__(self, text: str, images: Image | list[Image], **kwargs): - from vllm.model_executor.models.skyworkr1v import ( - IMG_CONTEXT, - IMG_END, - IMG_START, - image_to_pixel_values_skyworkr1v, + from vllm.transformers_utils.processors.internvl import ( + image_to_pixel_values_internvl, ) + IMG_START = "" + IMG_END = "" + IMG_CONTEXT = "" + images = [images] if isinstance(images, Image) else images pixel_values = [ - image_to_pixel_values_skyworkr1v( + image_to_pixel_values_internvl( image, input_size=self.image_size, min_num=self.min_num, @@ -815,14 +817,15 @@ def __call__( videos: npt.NDArray | list[npt.NDArray] = None, **kwargs, ): - from vllm.model_executor.models.internvl import ( - IMG_CONTEXT, - IMG_END, - IMG_START, + from vllm.transformers_utils.processors.internvl import ( image_to_pixel_values_internvl, video_to_pixel_values_internvl, ) + IMG_START = "" + IMG_END = "" + IMG_CONTEXT = "" + images = [images] if isinstance(images, Image) else images videos = [videos] if isinstance(videos, np.ndarray) else videos if images is not None: diff --git a/tests/models/registry.py b/tests/models/registry.py index 7f806064f6f8..7f5c27c8778b 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -774,7 +774,8 @@ def check_available_online( "rednote-hilab/dots.ocr", trust_remote_code=True ), "Eagle2_5_VLForConditionalGeneration": _HfExamplesInfo( - "nvidia/Eagle2.5-8B", trust_remote_code=True, is_available_online=False + "nvidia/Eagle2.5-8B", + trust_remote_code=True, ), "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo( diff --git a/vllm/model_executor/models/eagle2_5_vl.py b/vllm/model_executor/models/eagle2_5_vl.py index 3e6182db586c..30b8173f19cf 100644 --- a/vllm/model_executor/models/eagle2_5_vl.py +++ b/vllm/model_executor/models/eagle2_5_vl.py @@ -16,7 +16,10 @@ from vllm.model_executor.models.siglip import SiglipVisionModel from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor +from vllm.transformers_utils.processors.internvl import ( + InternVLImageProcessor, + InternVLProcessor, +) from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import ( @@ -68,12 +71,35 @@ class Eagle2_5_VLImageEmbeddingInputs(TensorSchema): class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo): """Processing info for Eagle2.5-VL model.""" - def get_hf_processor(self, **kwargs) -> Eagle2_5_VLProcessor: - return self.ctx.init_processor( - Eagle2_5_VLProcessor, - config=self.ctx.get_hf_config(), + def get_image_processor(self, **kwargs): + config = self.get_hf_config() + vision_config = config.vision_config + + kwargs = self.ctx.get_merged_mm_kwargs(kwargs) + kwargs.setdefault( + "image_size", config.force_image_size or vision_config.image_size + ) + kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch) + kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch) + kwargs.setdefault("dynamic_image_size", config.dynamic_image_size) + kwargs.setdefault("use_thumbnail", config.use_thumbnail) + + return InternVLImageProcessor(**kwargs) + + def get_hf_processor(self, **kwargs) -> InternVLProcessor: + config = self.get_hf_config() + vision_config = config.vision_config + + image_processor = self.get_image_processor(**kwargs) + image_size = image_processor.image_size + patch_size = vision_config.patch_size + downsample_ratio = config.downsample_ratio + image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2)) + + return InternVLProcessor( tokenizer=self.get_tokenizer(), - **kwargs, + image_processor=image_processor, + image_seq_length=image_seq_length, ) diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 4434d10369e9..83af8ea86cd9 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -395,13 +395,13 @@ def get_image_processor(self, **kwargs): vision_config = config.vision_config image_size = vision_config["image_size"] + kwargs = self.ctx.get_merged_mm_kwargs(kwargs) kwargs.setdefault("size", {"width": image_size, "height": image_size}) return GLM4VImageProcessorFast(**kwargs) def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor: - return self.ctx.init_processor( - GLM4VProcessor, + return GLM4VProcessor( tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor(**kwargs), ) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 3b01985c4458..bc9b2cc74e64 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -28,7 +28,7 @@ PromptUpdate, TimingContext, ) -from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor +from vllm.transformers_utils.processors.h2ovl import H2OVLImageProcessor, H2OVLProcessor from .intern_vit import InternVisionModel from .internvl import ( @@ -40,12 +40,34 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo): + def get_image_processor(self, **kwargs): + config = self.get_hf_config() + vision_config = config.vision_config + + kwargs = self.ctx.get_merged_mm_kwargs(kwargs) + kwargs.setdefault("image_size", vision_config.image_size) + kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch) + kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch) + kwargs.setdefault("dynamic_image_size", config.dynamic_image_size) + kwargs.setdefault("use_thumbnail", config.use_thumbnail) + kwargs.setdefault("use_msac", config.use_msac) + + return H2OVLImageProcessor(**kwargs) + def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor: - return self.ctx.init_processor( - H2OVLProcessor, - config=self.get_hf_config(), + config = self.get_hf_config() + vision_config = config.vision_config + + image_processor = self.get_image_processor(**kwargs) + image_size = image_processor.image_size + patch_size = vision_config.patch_size + downsample_ratio = config.downsample_ratio + image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2)) + + return H2OVLProcessor( tokenizer=self.get_tokenizer(), - **kwargs, + image_processor=image_processor, + image_seq_length=image_seq_length, ) def get_num_image_tokens( @@ -106,7 +128,7 @@ def get_replacement_internvl(item_idx: int): if num_patches is not None: assert isinstance(num_patches, int) - return hf_processor.get_image_repl(feature_size, num_patches) + return hf_processor.get_image_repl(num_patches, num_features=feature_size) return [ PromptReplacement( diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 8126391b269e..3c33da212f1d 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -9,6 +9,7 @@ # -------------------------------------------------------- from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence +from functools import cached_property from typing import Annotated, Literal, TypeAlias, TypeVar import torch @@ -45,8 +46,9 @@ ) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.processors.internvl import ( - BaseInternVLProcessor, + InternVLImageProcessor, InternVLProcessor, + InternVLVideoProcessor, ) from vllm.utils.tensor_schema import TensorSchema, TensorShape @@ -123,7 +125,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo): """Basic image-only ProcessingInfo for InternVL-style models.""" @abstractmethod - def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor: + def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: raise NotImplementedError def get_supported_mm_limits(self) -> Mapping[str, int | None]: @@ -134,7 +136,7 @@ def get_num_image_tokens( *, image_width: int, image_height: int, - processor: BaseInternVLProcessor, + processor: InternVLProcessor, ) -> int: return processor.get_num_image_tokens( image_width=image_width, @@ -143,8 +145,9 @@ def get_num_image_tokens( def get_image_size_with_most_features(self) -> ImageSize: processor = self.get_hf_processor() + image_processor = processor.image_processor - base_size = processor.image_size + base_size = image_processor.image_size target_ratios = processor.resolve_target_ratios() largest_feature_size, largest_feature_pinpoint = 0, None @@ -226,7 +229,7 @@ def _call_hf_processor( ) hf_processor = self.info.get_hf_processor(**mm_kwargs) - image_token_id = hf_processor.image_token_id + image_token_id = hf_processor.ctx_image_token_id # Since there may be extra tokens in the feature placeholders, # we need to pass the image token ID to the model to select the @@ -291,7 +294,7 @@ def get_replacement_internvl(item_idx: int): if num_patches is not None: assert isinstance(num_patches, int) - return hf_processor.get_image_repl(feature_size, num_patches) + return hf_processor.get_image_repl(num_patches, num_features=feature_size) return [ PromptReplacement( @@ -305,23 +308,73 @@ def get_replacement_internvl(item_idx: int): class InternVLProcessingInfo(BaseInternVLProcessingInfo): """InternVL ProcessingInfo extended for video processing""" - @property - def supports_video(self): - return self.get_hf_processor().supports_video + def get_image_processor(self, **kwargs): + config = self.get_hf_config() + vision_config = config.vision_config - def get_supported_mm_limits(self): - video_limit = {"video": None} if self.supports_video else {} - return {**super().get_supported_mm_limits(), **video_limit} + kwargs = self.ctx.get_merged_mm_kwargs(kwargs) + kwargs.setdefault("image_size", vision_config.image_size) + kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch) + kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch) + kwargs.setdefault("dynamic_image_size", config.dynamic_image_size) + kwargs.setdefault("use_thumbnail", config.use_thumbnail) + + return InternVLImageProcessor(**kwargs) + + def get_video_processor(self, **kwargs): + config = self.get_hf_config() + vision_config = config.vision_config - def get_video_token(self) -> str | None: + kwargs = self.ctx.get_merged_mm_kwargs(kwargs) + kwargs.setdefault("image_size", vision_config.image_size) + + return InternVLVideoProcessor(**kwargs) + + @cached_property + def ctx_video_token(self): text_model_type = self.get_hf_config().get_text_config().model_type - video_token_map = { + ctx_video_token_map = { "qwen2": "<|video_pad|>", "qwen3": "<|video_pad|>", "qwen3_moe": "<|video_pad|>", "gpt_oss": "<|reserved_200000|>", } - return video_token_map.get(text_model_type) + + if text_model_type not in ctx_video_token_map: + return None + + ctx_video_token = ctx_video_token_map[text_model_type] + if ctx_video_token not in self.get_tokenizer().get_vocab(): + return None + + return ctx_video_token + + def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: + config = self.get_hf_config() + vision_config = config.vision_config + + image_processor = self.get_image_processor(**kwargs) + image_size = image_processor.image_size + patch_size = vision_config.patch_size + downsample_ratio = config.downsample_ratio + image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2)) + + ctx_video_token = self.ctx_video_token + video_processor = ( + self.get_video_processor(**kwargs) if ctx_video_token else None + ) + + return InternVLProcessor( + tokenizer=self.get_tokenizer(), + image_processor=image_processor, + video_processor=video_processor, + image_seq_length=image_seq_length, + ctx_video_token=ctx_video_token, + ) + + def get_supported_mm_limits(self): + video_limit = {"video": None} if self.ctx_video_token else {} + return {**super().get_supported_mm_limits(), **video_limit} def get_num_frames_with_most_features( self, @@ -332,22 +385,14 @@ def get_num_frames_with_most_features( max_videos = mm_counts.get("video", 0) processor = self.get_hf_processor() + num_image_token = processor.image_seq_length max_image_tokens = self.get_max_image_tokens() * max_images - max_total_frames = (seq_len - max_image_tokens) // processor.num_image_token + max_total_frames = (seq_len - max_image_tokens) // num_image_token max_frames_per_video = max_total_frames // max(max_videos, 1) return max(max_frames_per_video, 1) - def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: - return self.ctx.init_processor( - InternVLProcessor, - config=self.get_hf_config(), - tokenizer=self.get_tokenizer(), - video_token=self.get_video_token(), - **kwargs, - ) - class InternVLDummyInputsBuilder( BaseInternVLDummyInputsBuilder[InternVLProcessingInfo] @@ -366,7 +411,7 @@ def get_dummy_mm_data( mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options) - if self.info.supports_video: + if self.info.ctx_video_token: config = self.info.get_hf_config() image_size: int = config.vision_config.image_size target_num_frames = self.info.get_num_frames_with_most_features( @@ -405,11 +450,9 @@ def _call_hf_processor( ) hf_processor = self.info.get_hf_processor(**mm_kwargs) - if ( - self.info.supports_video - and (video_token_id := hf_processor.video_token_id) is not None - ): + if (video_token_id := hf_processor.ctx_video_token_id) is not None: processed_outputs["video_token_id"] = torch.tensor(video_token_id) + return processed_outputs def _get_mm_fields_config( @@ -418,7 +461,7 @@ def _get_mm_fields_config( hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: image_fields = super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs) - if self.info.supports_video: + if self.info.ctx_video_token: video_num_patches = hf_inputs.get("video_num_patches", torch.empty(0)) num_videos = len(video_num_patches) video_fields = dict( @@ -444,6 +487,8 @@ def _get_prompt_updates( hf_processor_mm_kwargs=hf_processor_mm_kwargs, out_mm_kwargs=out_mm_kwargs, ) + if self.info.ctx_video_token is None: + return prompt_repl hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) @@ -456,26 +501,20 @@ def _get_prompt_updates( video_num_patches = [] def get_video_replacement_internvl(item_idx: int): - feature_size = hf_processor.num_image_token num_patches = video_num_patches[item_idx] if num_patches is not None: assert isinstance(num_patches, int) - return hf_processor.get_video_repl( - feature_size, num_patches, video_context_token=hf_processor.video_token - ) - - if self.info.supports_video: - prompt_repl = [ - *prompt_repl, - PromptReplacement( - modality="video", - target="