From 644cc1ad52ba0ac9b4f1366207b20f29f2c5943f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 17 Mar 2026 16:47:01 +0000 Subject: [PATCH 01/13] [2/2] Refactor InternVL-based processors Signed-off-by: DarkLight1337 --- tests/models/registry.py | 3 +- vllm/model_executor/models/eagle2_5_vl.py | 38 +- vllm/model_executor/models/h2ovl.py | 32 +- vllm/model_executor/models/internvl.py | 87 ++- vllm/model_executor/models/nemotron_vl.py | 93 ++- vllm/model_executor/models/nvlm_d.py | 26 +- vllm/model_executor/models/skyworkr1v.py | 39 +- .../transformers_utils/processors/__init__.py | 4 - .../processors/eagle2_5_vl.py | 85 --- vllm/transformers_utils/processors/h2ovl.py | 159 +++--- .../transformers_utils/processors/internvl.py | 530 +++++++++--------- .../processors/nemotron_vl.py | 342 +++++------ vllm/transformers_utils/processors/nvlm_d.py | 44 +- .../processors/skyworkr1v.py | 389 ------------- 14 files changed, 792 insertions(+), 1079 deletions(-) delete mode 100644 vllm/transformers_utils/processors/eagle2_5_vl.py delete mode 100644 vllm/transformers_utils/processors/skyworkr1v.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 7f806064f6f8..7f5c27c8778b 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -774,7 +774,8 @@ def check_available_online( "rednote-hilab/dots.ocr", trust_remote_code=True ), "Eagle2_5_VLForConditionalGeneration": _HfExamplesInfo( - "nvidia/Eagle2.5-8B", trust_remote_code=True, is_available_online=False + "nvidia/Eagle2.5-8B", + trust_remote_code=True, ), "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo( diff --git a/vllm/model_executor/models/eagle2_5_vl.py b/vllm/model_executor/models/eagle2_5_vl.py index 3e6182db586c..239248a2a663 100644 --- a/vllm/model_executor/models/eagle2_5_vl.py +++ b/vllm/model_executor/models/eagle2_5_vl.py @@ -16,7 +16,10 @@ from vllm.model_executor.models.siglip import SiglipVisionModel from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor +from vllm.transformers_utils.processors.internvl import ( + InternVLImageProcessor, + InternVLProcessor, +) from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import ( @@ -68,12 +71,37 @@ class Eagle2_5_VLImageEmbeddingInputs(TensorSchema): class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo): """Processing info for Eagle2.5-VL model.""" - def get_hf_processor(self, **kwargs) -> Eagle2_5_VLProcessor: + def get_image_processor(self, **kwargs): + config = self.get_hf_config() + vision_config = config.vision_config + + kwargs.setdefault( + "image_size", config.force_image_size or vision_config.image_size + ) + kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch) + kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch) + kwargs.setdefault("dynamic_image_size", config.dynamic_image_size) + kwargs.setdefault("use_thumbnail", config.use_thumbnail) + + return InternVLImageProcessor(**kwargs) + + def get_hf_processor(self, **kwargs) -> InternVLProcessor: + config = self.get_hf_config() + vision_config = config.vision_config + + image_processor = self.get_image_processor(**kwargs) + image_size = image_processor.image_size + patch_size = int(kwargs.get("patch_size", vision_config.patch_size)) + downsample_ratio = float( + kwargs.get("downsample_ratio", config.downsample_ratio) + ) + image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2)) + return self.ctx.init_processor( - Eagle2_5_VLProcessor, - config=self.ctx.get_hf_config(), + InternVLProcessor, tokenizer=self.get_tokenizer(), - **kwargs, + image_processor=image_processor, + image_seq_length=image_seq_length, ) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 3b01985c4458..9d8e59832c5a 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -28,7 +28,7 @@ PromptUpdate, TimingContext, ) -from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor +from vllm.transformers_utils.processors.h2ovl import H2OVLImageProcessor, H2OVLProcessor from .intern_vit import InternVisionModel from .internvl import ( @@ -40,12 +40,36 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo): + def get_image_processor(self, **kwargs): + config = self.get_hf_config() + vision_config = config.vision_config + + kwargs.setdefault("image_size", vision_config.image_size) + kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch) + kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch) + kwargs.setdefault("dynamic_image_size", config.dynamic_image_size) + kwargs.setdefault("use_thumbnail", config.use_thumbnail) + kwargs.setdefault("use_msac", config.use_msac) + + return H2OVLImageProcessor(**kwargs) + def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor: + config = self.get_hf_config() + vision_config = config.vision_config + + image_processor = self.get_image_processor(**kwargs) + image_size = image_processor.image_size + patch_size = int(kwargs.get("patch_size", vision_config.patch_size)) + downsample_ratio = float( + kwargs.get("downsample_ratio", config.downsample_ratio) + ) + image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2)) + return self.ctx.init_processor( H2OVLProcessor, - config=self.get_hf_config(), tokenizer=self.get_tokenizer(), - **kwargs, + image_processor=image_processor, + image_seq_length=image_seq_length, ) def get_num_image_tokens( @@ -106,7 +130,7 @@ def get_replacement_internvl(item_idx: int): if num_patches is not None: assert isinstance(num_patches, int) - return hf_processor.get_image_repl(feature_size, num_patches) + return hf_processor.get_image_repl(num_patches, num_features=feature_size) return [ PromptReplacement( diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 8126391b269e..0f1ba2084056 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -45,8 +45,10 @@ ) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.processors.internvl import ( - BaseInternVLProcessor, + InternVLImageProcessor, InternVLProcessor, + InternVLProcessorLike, + InternVLVideoProcessor, ) from vllm.utils.tensor_schema import TensorSchema, TensorShape @@ -123,7 +125,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo): """Basic image-only ProcessingInfo for InternVL-style models.""" @abstractmethod - def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor: + def get_hf_processor(self, **kwargs: object) -> InternVLProcessorLike: raise NotImplementedError def get_supported_mm_limits(self) -> Mapping[str, int | None]: @@ -134,7 +136,7 @@ def get_num_image_tokens( *, image_width: int, image_height: int, - processor: BaseInternVLProcessor, + processor: InternVLProcessorLike, ) -> int: return processor.get_num_image_tokens( image_width=image_width, @@ -143,8 +145,9 @@ def get_num_image_tokens( def get_image_size_with_most_features(self) -> ImageSize: processor = self.get_hf_processor() + image_processor = processor.image_processor - base_size = processor.image_size + base_size = image_processor.image_size target_ratios = processor.resolve_target_ratios() largest_feature_size, largest_feature_pinpoint = 0, None @@ -291,7 +294,7 @@ def get_replacement_internvl(item_idx: int): if num_patches is not None: assert isinstance(num_patches, int) - return hf_processor.get_image_repl(feature_size, num_patches) + return hf_processor.get_image_repl(num_patches, num_features=feature_size) return [ PromptReplacement( @@ -305,15 +308,27 @@ def get_replacement_internvl(item_idx: int): class InternVLProcessingInfo(BaseInternVLProcessingInfo): """InternVL ProcessingInfo extended for video processing""" - @property - def supports_video(self): - return self.get_hf_processor().supports_video + def get_image_processor(self, **kwargs): + config = self.get_hf_config() + vision_config = config.vision_config - def get_supported_mm_limits(self): - video_limit = {"video": None} if self.supports_video else {} - return {**super().get_supported_mm_limits(), **video_limit} + kwargs.setdefault("image_size", vision_config.image_size) + kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch) + kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch) + kwargs.setdefault("dynamic_image_size", config.dynamic_image_size) + kwargs.setdefault("use_thumbnail", config.use_thumbnail) + + return InternVLImageProcessor(**kwargs) + + def get_video_processor(self, **kwargs): + config = self.get_hf_config() + vision_config = config.vision_config + + kwargs.setdefault("image_size", vision_config.image_size) - def get_video_token(self) -> str | None: + return InternVLVideoProcessor(**kwargs) + + def get_video_token(self): text_model_type = self.get_hf_config().get_text_config().model_type video_token_map = { "qwen2": "<|video_pad|>", @@ -323,6 +338,37 @@ def get_video_token(self) -> str | None: } return video_token_map.get(text_model_type) + def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: + config = self.get_hf_config() + vision_config = config.vision_config + + image_processor = self.get_image_processor(**kwargs) + image_size = image_processor.image_size + patch_size = int(kwargs.get("patch_size", vision_config.patch_size)) + downsample_ratio = float( + kwargs.get("downsample_ratio", config.downsample_ratio) + ) + image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2)) + + video_token = self.get_video_token() + + return self.ctx.init_processor( + InternVLProcessor, + tokenizer=self.get_tokenizer(), + image_processor=image_processor, + video_processor=self.get_video_processor(**kwargs) if video_token else None, + image_seq_length=image_seq_length, + video_token=video_token, + ) + + @property + def supports_video(self): + return self.get_video_token() is not None + + def get_supported_mm_limits(self): + video_limit = {"video": None} if self.supports_video else {} + return {**super().get_supported_mm_limits(), **video_limit} + def get_num_frames_with_most_features( self, seq_len: int, @@ -332,22 +378,14 @@ def get_num_frames_with_most_features( max_videos = mm_counts.get("video", 0) processor = self.get_hf_processor() + num_image_token = processor.image_seq_length max_image_tokens = self.get_max_image_tokens() * max_images - max_total_frames = (seq_len - max_image_tokens) // processor.num_image_token + max_total_frames = (seq_len - max_image_tokens) // num_image_token max_frames_per_video = max_total_frames // max(max_videos, 1) return max(max_frames_per_video, 1) - def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: - return self.ctx.init_processor( - InternVLProcessor, - config=self.get_hf_config(), - tokenizer=self.get_tokenizer(), - video_token=self.get_video_token(), - **kwargs, - ) - class InternVLDummyInputsBuilder( BaseInternVLDummyInputsBuilder[InternVLProcessingInfo] @@ -456,14 +494,11 @@ def _get_prompt_updates( video_num_patches = [] def get_video_replacement_internvl(item_idx: int): - feature_size = hf_processor.num_image_token num_patches = video_num_patches[item_idx] if num_patches is not None: assert isinstance(num_patches, int) - return hf_processor.get_video_repl( - feature_size, num_patches, video_context_token=hf_processor.video_token - ) + return hf_processor.get_video_repl(num_patches) if self.info.supports_video: prompt_repl = [ diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index 0b29eccee233..6fe363d055e7 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -26,8 +26,9 @@ from vllm.sequence import IntermediateTensors from vllm.transformers_utils.processor import cached_image_processor_from_config from vllm.transformers_utils.processors.nemotron_vl import ( + LlamaNemotronNanoVLProcessor, + LlamaNemotronVLEmbedImageProcessor, LlamaNemotronVLEmbedProcessor, - NemotronVLProcessor, ) from vllm.transformers_utils.repo_utils import get_hf_file_to_dict @@ -50,19 +51,26 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo): """Processing info for Nemotron VL models.""" - def get_hf_processor(self, **kwargs: object) -> NemotronVLProcessor: - return self.ctx.init_processor( - NemotronVLProcessor, - config=self.get_hf_config(), - tokenizer=self.get_tokenizer(), - image_processor=self.get_image_processor(), - **kwargs, + def get_image_processor(self, **kwargs: object): + return cached_image_processor_from_config(self.ctx.model_config, **kwargs) + + def get_hf_processor(self, **kwargs: object) -> LlamaNemotronNanoVLProcessor: + config = self.get_hf_config() + vision_config = config.vision_config + + image_processor = self.get_image_processor(**kwargs) + image_size = image_processor.image_size + patch_size = int(kwargs.get("patch_size", vision_config.patch_size)) + downsample_ratio = float( + kwargs.get("downsample_ratio", config.downsample_ratio) ) + image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2)) - def get_image_processor(self, **kwargs: object): - return cached_image_processor_from_config( - self.ctx.model_config, - **kwargs, + return self.ctx.init_processor( + LlamaNemotronNanoVLProcessor, + tokenizer=self.get_tokenizer(), + image_processor=image_processor, + image_seq_length=image_seq_length, ) @@ -386,29 +394,60 @@ def get_mm_mapping(self) -> MultiModelKeys: # -------------------------------------------------------- -class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo): +class LlamaNemotronVLEmbedProcessingInfo(BaseInternVLProcessingInfo): """Processing info for LlamaNemotronVL embedding model.""" - def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor: - """Override to create embedding-specific processor without image_processor.""" + def get_image_processor(self, **kwargs): model_config = self.ctx.model_config - processor_config = {} - if model_config.model is not None: - processor_config = ( - get_hf_file_to_dict( - "processor_config.json", - model_config.model, - model_config.revision, - ) - or {} + + config = self.get_hf_config() + processor_config = ( + get_hf_file_to_dict( + "processor_config.json", + model_config.model, + model_config.revision, ) + or {} + ) + + min_dynamic_patch = processor_config.get( + "min_input_tiles", + getattr(config, "min_dynamic_patch", 1), + ) + max_dynamic_patch = processor_config.get( + "max_input_tiles", + getattr(config, "max_dynamic_patch", 1), + ) + dynamic_image_size = processor_config.get( + "dynamic_image_size", + getattr(config, "dynamic_image_size", True), + ) + + kwargs.setdefault("image_size", config.force_image_size) + kwargs.setdefault("min_dynamic_patch", min_dynamic_patch) + kwargs.setdefault("max_dynamic_patch", max_dynamic_patch) + kwargs.setdefault("dynamic_image_size", dynamic_image_size) + kwargs.setdefault("use_thumbnail", True) + + return LlamaNemotronVLEmbedImageProcessor(**kwargs) + + def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor: + config = self.get_hf_config() + vision_config = config.vision_config + + image_processor = self.get_image_processor(**kwargs) + image_size = image_processor.image_size + patch_size = int(kwargs.get("patch_size", vision_config.patch_size)) + downsample_ratio = float( + kwargs.get("downsample_ratio", config.downsample_ratio) + ) + image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2)) return self.ctx.init_processor( LlamaNemotronVLEmbedProcessor, - config=self.get_hf_config(), tokenizer=self.get_tokenizer(), - processor_config=processor_config, - **kwargs, + image_processor=self.get_image_processor(**kwargs), + image_seq_length=image_seq_length, ) diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index d0061b3782fa..ec67b7ce80c0 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -27,7 +27,8 @@ PromptUpdate, PromptUpdateDetails, ) -from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor +from vllm.transformers_utils.processors.internvl import InternVLImageProcessor +from vllm.transformers_utils.processors.nvlm_d import NVLMProcessor from .intern_vit import InternVisionModel from .internvl import ( @@ -39,12 +40,25 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo): + def get_image_processor(self, **kwargs): + config = self.get_hf_config() + vision_config = config.vision_config + + kwargs.setdefault("image_size", vision_config.image_size) + kwargs.setdefault("patch_size", vision_config.patch_size) + kwargs.setdefault("downsample_ratio", config.downsample_ratio) + kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch) + kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch) + kwargs.setdefault("dynamic_image_size", config.dynamic_image_size) + kwargs.setdefault("use_thumbnail", config.use_thumbnail) + + return InternVLImageProcessor(**kwargs) + def get_hf_processor(self, **kwargs: object) -> NVLMProcessor: return self.ctx.init_processor( NVLMProcessor, - config=self.get_hf_config(), tokenizer=self.get_tokenizer(), - **kwargs, + image_processor=self.get_image_processor(**kwargs), ) @@ -117,9 +131,11 @@ def get_replacement_nvlm(item_idx: int): if num_patches is not None: assert isinstance(num_patches, int) - repl = hf_processor.get_image_repl(feature_size, num_patches) + repl = hf_processor.get_image_repl(num_patches, num_features=feature_size) - return PromptUpdateDetails.select_text(repl.full + "\n", IMG_PAD) + return PromptUpdateDetails.select_text( + repl.full + "\n", hf_processor.image_token + ) # See note in dummy data regarding why we have the extra newline return [ diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index eed5bb1f7681..d3cb08095898 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -43,7 +43,10 @@ PromptUpdate, ) from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.processors.skyworkr1v import SkyworkR1VProcessor +from vllm.transformers_utils.processors.internvl import ( + InternVLImageProcessor, + InternVLProcessor, +) from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -96,12 +99,34 @@ class SkyworkR1VImageEmbeddingInputs(TensorSchema): class SkyworkR1VProcessingInfo(BaseProcessingInfo): - def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor: + def get_image_processor(self, **kwargs): + config = self.get_hf_config() + vision_config = config.vision_config + + kwargs.setdefault("image_size", vision_config.image_size) + kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch) + kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch) + kwargs.setdefault("dynamic_image_size", config.dynamic_image_size) + kwargs.setdefault("use_thumbnail", config.use_thumbnail) + + return InternVLImageProcessor(**kwargs) + + def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: + config = self.get_hf_config() + vision_config = config.vision_config + + image_size = int(kwargs.get("image_size", vision_config.image_size)) + patch_size = int(kwargs.get("patch_size", vision_config.patch_size)) + downsample_ratio = float( + kwargs.get("downsample_ratio", config.downsample_ratio) + ) + image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2)) + return self.ctx.init_processor( - SkyworkR1VProcessor, - config=self.get_hf_config(), + InternVLProcessor, tokenizer=self.get_tokenizer(), - **kwargs, + image_processor=self.get_image_processor(**kwargs), + image_seq_length=image_seq_length, ) def get_supported_mm_limits(self) -> Mapping[str, int | None]: @@ -112,7 +137,7 @@ def get_num_image_tokens( *, image_width: int, image_height: int, - processor: SkyworkR1VProcessor, + processor: InternVLProcessor, ) -> int: return processor.get_num_image_tokens( image_width=image_width, @@ -252,7 +277,7 @@ def get_replacement_skyworkr1v(item_idx: int): if num_patches is not None: assert isinstance(num_patches, int) - return hf_processor.get_image_repl(feature_size, num_patches) + return hf_processor.get_image_repl(num_patches, num_features=feature_size) return [ PromptReplacement( diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py index 9c393b700627..068b59ffabef 100644 --- a/vllm/transformers_utils/processors/__init__.py +++ b/vllm/transformers_utils/processors/__init__.py @@ -13,7 +13,6 @@ __all__ = [ "BagelProcessor", "DeepseekVLV2Processor", - "Eagle2_5_VLProcessor", "FireRedASR2Processor", "FunASRProcessor", "GLM4VProcessor", @@ -33,13 +32,11 @@ "Ovis2_5Processor", "QwenVLProcessor", "Qwen3ASRProcessor", - "SkyworkR1VProcessor", ] _CLASS_TO_MODULE: dict[str, str] = { "BagelProcessor": "vllm.transformers_utils.processors.bagel", "DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2", - "Eagle2_5_VLProcessor": "vllm.transformers_utils.processors.eagle2_5_vl", "FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2", "FunASRProcessor": "vllm.transformers_utils.processors.funasr", "GLM4VProcessor": "vllm.transformers_utils.processors.glm4v", @@ -59,7 +56,6 @@ "Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5", "QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl", "Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr", - "SkyworkR1VProcessor": "vllm.transformers_utils.processors.skyworkr1v", } diff --git a/vllm/transformers_utils/processors/eagle2_5_vl.py b/vllm/transformers_utils/processors/eagle2_5_vl.py deleted file mode 100644 index b3c37754b35e..000000000000 --- a/vllm/transformers_utils/processors/eagle2_5_vl.py +++ /dev/null @@ -1,85 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# Adapted from NVIDIA Eagle2.5-VL model -# https://huggingface.co/nvidia/Eagle2.5-8B -from transformers import PretrainedConfig - -from vllm.multimodal.processing import PromptUpdateDetails -from vllm.tokenizers import TokenizerLike - -from .internvl import IMG_CONTEXT, IMG_END, IMG_START, BaseInternVLProcessor - - -class Eagle2_5_VLProcessor(BaseInternVLProcessor): - """ - Custom processor for Eagle2.5-VL model. - Extends BaseInternVLProcessor with Eagle-specific token handling. - """ - - def __init__( - self, - config: PretrainedConfig, - tokenizer: TokenizerLike, - *, - min_dynamic_patch: int | None = None, - max_dynamic_patch: int | None = None, - dynamic_image_size: bool | None = None, - ) -> None: - # Skip super().__init__() to avoid config manipulation - # Directly initialize all required attributes - self.config = config - self.tokenizer = tokenizer - - # Image size with force_image_size override - image_size: int = config.vision_config.image_size - if hasattr(config, "force_image_size") and config.force_image_size: - image_size = config.force_image_size - - patch_size: int = config.vision_config.patch_size - downsample_ratio: float = getattr(config, "downsample_ratio", 0.5) - - # Compute num_image_token - self.num_image_token = int( - (image_size // patch_size) ** 2 * (downsample_ratio**2) - ) - self.image_size = image_size - - # Dynamic patch settings with defaults - self.min_dynamic_patch = ( - min_dynamic_patch - if min_dynamic_patch is not None - else getattr(config, "min_dynamic_patch", 1) - ) - self.max_dynamic_patch = ( - max_dynamic_patch - if max_dynamic_patch is not None - else getattr(config, "max_dynamic_patch", 12) - ) - self.dynamic_image_size = ( - dynamic_image_size - if dynamic_image_size is not None - else getattr(config, "dynamic_image_size", True) - ) - self.use_thumbnail: bool = getattr(config, "use_thumbnail", True) - - @property - def image_token_id(self) -> int: - """Get the image token ID from config or tokenizer.""" - if hasattr(self.config, "image_token_index"): - return self.config.image_token_index - # Fallback to tokenizer vocab - use (ID: 151667) - vocab = self.tokenizer.get_vocab() - if IMG_CONTEXT in vocab: - return vocab[IMG_CONTEXT] - raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary") - - def get_image_repl( - self, - feature_size: int, - num_patches: int | None, - ) -> PromptUpdateDetails[str]: - """Get image replacement string for prompt.""" - repl_features = IMG_CONTEXT * feature_size - repl_full = IMG_START + repl_features + IMG_END - - return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) diff --git a/vllm/transformers_utils/processors/h2ovl.py b/vllm/transformers_utils/processors/h2ovl.py index 2f256c75a945..3b30423bea89 100644 --- a/vllm/transformers_utils/processors/h2ovl.py +++ b/vllm/transformers_utils/processors/h2ovl.py @@ -10,16 +10,12 @@ # -------------------------------------------------------- import torch from PIL import Image -from transformers import PretrainedConfig -from vllm.multimodal.processing import PromptUpdateDetails from vllm.tokenizers import TokenizerLike from .internvl import ( - IMG_CONTEXT, - IMG_END, - IMG_START, - BaseInternVLProcessor, + InternVLImageProcessor, + InternVLProcessor, build_transform, find_closest_aspect_ratio, get_internvl_target_ratios, @@ -217,45 +213,26 @@ def image_to_pixel_values_h2ovl( return pixel_values -class H2OVLProcessor(BaseInternVLProcessor): +class H2OVLImageProcessor(InternVLImageProcessor): def __init__( self, - config: PretrainedConfig, - tokenizer: TokenizerLike, - *, - min_dynamic_patch: int | None = None, - max_dynamic_patch: int | None = None, - dynamic_image_size: bool | None = None, - use_msac: bool | None = None, + image_size: int, + min_dynamic_patch: int, + max_dynamic_patch: int, + dynamic_image_size: bool, + use_thumbnail: bool, + use_msac: bool, ) -> None: super().__init__( - config, - tokenizer, + image_size=image_size, min_dynamic_patch=min_dynamic_patch, max_dynamic_patch=max_dynamic_patch, dynamic_image_size=dynamic_image_size, + use_thumbnail=use_thumbnail, ) - if use_msac is None: - use_msac = config.use_msac - assert isinstance(use_msac, bool) - self.use_msac = use_msac - @property - def image_token_id(self) -> int: - return self.tokenizer.get_vocab()[IMG_CONTEXT] - - def get_image_repl( - self, - feature_size: int, - num_patches: int | None, - ) -> PromptUpdateDetails[str]: - repl_features = IMG_CONTEXT * feature_size - repl_full = IMG_START + repl_features + IMG_END - - return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) - def resolve_min_max_num( self, *, @@ -264,18 +241,14 @@ def resolve_min_max_num( dynamic_image_size: bool | None = None, use_thumbnail: bool | None = None, ) -> tuple[int, int]: - min_dynamic_patch = ( - self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch - ) - max_dynamic_patch = ( - self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch - ) - dynamic_image_size = ( - self.dynamic_image_size - if dynamic_image_size is None - else dynamic_image_size - ) - use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail + if min_dynamic_patch is None: + min_dynamic_patch = self.min_dynamic_patch + if max_dynamic_patch is None: + max_dynamic_patch = self.max_dynamic_patch + if dynamic_image_size is None: + dynamic_image_size = self.dynamic_image_size + if use_thumbnail is None: + use_thumbnail = self.use_thumbnail return resolve_h2ovl_min_max_num( min_dynamic_patch=min_dynamic_patch, @@ -284,6 +257,57 @@ def resolve_min_max_num( use_thumbnail=use_thumbnail, ) + def _images_to_pixel_values_lst( + self, + images: list[Image.Image], + min_dynamic_patch: int | None = None, + max_dynamic_patch: int | None = None, + dynamic_image_size: bool | None = None, + ) -> list[torch.Tensor]: + use_msac = self.use_msac if len(images) == 1 else False + + min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=False, # Applied in image_to_pixel_values + ) + + return [ + image_to_pixel_values_h2ovl( + image, + input_size=self.image_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=self.use_thumbnail, + use_msac=use_msac, + ) + for image in images + ] + + +class H2OVLProcessor(InternVLProcessor): + def __init__( + self, + image_processor: H2OVLImageProcessor, + tokenizer: TokenizerLike, + *, + image_seq_length: int, + image_token: str = "", + start_image_token: str = "", + end_image_token: str = "", + ) -> None: + super().__init__( + image_processor=image_processor, + tokenizer=tokenizer, + image_seq_length=image_seq_length, + image_token=image_token, + start_image_token=start_image_token, + end_image_token=end_image_token, + ) + + self.image_processor: H2OVLImageProcessor + def resolve_target_ratios( self, *, @@ -294,7 +318,7 @@ def resolve_target_ratios( prior_aspect_ratio: tuple[int, int] | None = None, override_min_num: int | None = None, ) -> list[tuple[int, int]]: - min_num, max_num = self.resolve_min_max_num( + min_num, max_num = self.image_processor.resolve_min_max_num( min_dynamic_patch=min_dynamic_patch, max_dynamic_patch=max_dynamic_patch, dynamic_image_size=dynamic_image_size, @@ -316,9 +340,10 @@ def get_num_image_tokens( image_height: int, use_msac: bool | None = None, ) -> int: - use_msac = self.use_msac if use_msac is None else use_msac + image_processor = self.image_processor + use_msac = image_processor.use_msac if use_msac is None else use_msac - use_thumbnail = self.use_thumbnail + use_thumbnail = image_processor.use_thumbnail if use_msac: target_ratios_1 = self.resolve_target_ratios( @@ -328,7 +353,7 @@ def get_num_image_tokens( num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets( orig_width=image_width, orig_height=image_height, - image_size=self.image_size, + image_size=image_processor.image_size, target_ratios=target_ratios_1, use_thumbnail=True, ) @@ -341,7 +366,7 @@ def get_num_image_tokens( num_patches_2, _, _, _ = calculate_h2ovl_targets( orig_width=image_width, orig_height=image_height, - image_size=self.image_size, + image_size=image_processor.image_size, target_ratios=target_ratios_2, use_thumbnail=True, ) @@ -354,37 +379,9 @@ def get_num_image_tokens( num_patches, _, _, _ = calculate_h2ovl_targets( orig_width=image_width, orig_height=image_height, - image_size=self.image_size, + image_size=image_processor.image_size, target_ratios=target_ratios, use_thumbnail=use_thumbnail, ) - return num_patches * self.num_image_token - - def _images_to_pixel_values_lst( - self, - images: list[Image.Image], - min_dynamic_patch: int | None = None, - max_dynamic_patch: int | None = None, - dynamic_image_size: bool | None = None, - ) -> list[torch.Tensor]: - use_msac = self.use_msac if len(images) == 1 else False - - min_num, max_num = self.resolve_min_max_num( - min_dynamic_patch=min_dynamic_patch, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, - use_thumbnail=False, # Applied in image_to_pixel_values - ) - - return [ - image_to_pixel_values_h2ovl( - image, - input_size=self.image_size, - min_num=min_num, - max_num=max_num, - use_thumbnail=self.use_thumbnail, - use_msac=use_msac, - ) - for image in images - ] + return num_patches * self.image_seq_length diff --git a/vllm/transformers_utils/processors/internvl.py b/vllm/transformers_utils/processors/internvl.py index b5c231cb48ee..84342584b5df 100644 --- a/vllm/transformers_utils/processors/internvl.py +++ b/vllm/transformers_utils/processors/internvl.py @@ -7,25 +7,19 @@ # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- -from abc import ABC, abstractmethod -from typing import Any, TypeVar +from typing import Protocol import numpy.typing as npt import torch import torchvision.transforms as T from PIL import Image -from transformers import BatchFeature, PretrainedConfig, TensorType +from transformers import BatchFeature, TensorType +from transformers.processing_utils import ProcessorMixin from vllm.multimodal.image import convert_image_mode from vllm.multimodal.processing import PromptUpdateDetails from vllm.tokenizers import TokenizerLike -_T = TypeVar("_T") - -IMG_START = "" -IMG_END = "" -IMG_CONTEXT = "" - IMAGENET_MEAN = (0.485, 0.456, 0.406) IMAGENET_STD = (0.229, 0.224, 0.225) @@ -33,7 +27,7 @@ # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B def build_transform(input_size: int): MEAN, STD = IMAGENET_MEAN, IMAGENET_STD - transform = T.Compose( + return T.Compose( [ T.Lambda(lambda img: convert_image_mode(img, "RGB")), T.Resize( @@ -43,7 +37,6 @@ def build_transform(input_size: int): T.Normalize(mean=MEAN, std=STD), ] ) - return transform # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B @@ -223,65 +216,20 @@ def video_to_pixel_values_internvl( return pixel_values -class BaseInternVLProcessor(ABC): - """ - This model doesn't define its own HF processor, - so we implement our own one here. - - The code to insert image tokens is based on: - https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252 - """ - +class InternVLImageProcessor: def __init__( self, - config: PretrainedConfig, - tokenizer: TokenizerLike, - *, - min_dynamic_patch: int | None = None, - max_dynamic_patch: int | None = None, - dynamic_image_size: bool | None = None, + image_size: int, + min_dynamic_patch: int, + max_dynamic_patch: int, + dynamic_image_size: bool, + use_thumbnail: bool, ) -> None: - super().__init__() - - self.config = config - self.tokenizer = tokenizer - - image_size: int = config.vision_config.image_size - patch_size: int = config.vision_config.patch_size - - if min_dynamic_patch is None: - min_dynamic_patch = config.min_dynamic_patch - assert isinstance(min_dynamic_patch, int) - - if max_dynamic_patch is None: - max_dynamic_patch = config.max_dynamic_patch - assert isinstance(max_dynamic_patch, int) - - if dynamic_image_size is None: - dynamic_image_size = config.dynamic_image_size - assert isinstance(dynamic_image_size, bool) - - self.num_image_token = int( - (image_size // patch_size) ** 2 * (config.downsample_ratio**2) - ) self.image_size = image_size self.min_dynamic_patch = min_dynamic_patch self.max_dynamic_patch = max_dynamic_patch self.dynamic_image_size = dynamic_image_size - self.use_thumbnail: bool = config.use_thumbnail - - @property - @abstractmethod - def image_token_id(self) -> int: - raise NotImplementedError - - @abstractmethod - def get_image_repl( - self, - feature_size: int, - num_patches: int | None, - ) -> PromptUpdateDetails[str]: - raise NotImplementedError + self.use_thumbnail = use_thumbnail def resolve_min_max_num( self, @@ -291,18 +239,14 @@ def resolve_min_max_num( dynamic_image_size: bool | None = None, use_thumbnail: bool | None = None, ) -> tuple[int, int]: - min_dynamic_patch = ( - self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch - ) - max_dynamic_patch = ( - self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch - ) - dynamic_image_size = ( - self.dynamic_image_size - if dynamic_image_size is None - else dynamic_image_size - ) - use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail + if min_dynamic_patch is None: + min_dynamic_patch = self.min_dynamic_patch + if max_dynamic_patch is None: + max_dynamic_patch = self.max_dynamic_patch + if dynamic_image_size is None: + dynamic_image_size = self.dynamic_image_size + if use_thumbnail is None: + use_thumbnail = self.use_thumbnail return resolve_internvl_min_max_num( min_dynamic_patch=min_dynamic_patch, @@ -311,43 +255,6 @@ def resolve_min_max_num( use_thumbnail=use_thumbnail, ) - def resolve_target_ratios( - self, - *, - min_dynamic_patch: int | None = None, - max_dynamic_patch: int | None = None, - dynamic_image_size: bool | None = None, - use_thumbnail: bool | None = None, - ) -> list[tuple[int, int]]: - min_num, max_num = self.resolve_min_max_num( - min_dynamic_patch=min_dynamic_patch, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, - use_thumbnail=use_thumbnail, - ) - - return get_internvl_target_ratios(min_num, max_num) - - def get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, - ) -> int: - target_ratios = self.resolve_target_ratios( - use_thumbnail=False, # Applied in calculate_targets - ) - - num_patches, _, _ = calculate_internvl_targets( - orig_width=image_width, - orig_height=image_height, - image_size=self.image_size, - target_ratios=target_ratios, - use_thumbnail=self.use_thumbnail, - ) - - return num_patches * self.num_image_token - def _images_to_pixel_values_lst( self, images: list[Image.Image], @@ -355,7 +262,14 @@ def _images_to_pixel_values_lst( max_dynamic_patch: int | None = None, dynamic_image_size: bool | None = None, ) -> list[torch.Tensor]: - min_num, max_num = self.resolve_min_max_num( + if min_dynamic_patch is None: + min_dynamic_patch = self.min_dynamic_patch + if max_dynamic_patch is None: + max_dynamic_patch = self.max_dynamic_patch + if dynamic_image_size is None: + dynamic_image_size = self.dynamic_image_size + + min_num, max_num = resolve_internvl_min_max_num( min_dynamic_patch=min_dynamic_patch, max_dynamic_patch=max_dynamic_patch, dynamic_image_size=dynamic_image_size, @@ -373,49 +287,9 @@ def _images_to_pixel_values_lst( for image in images ] - def _preprocess_image( - self, - text: list[str], - images: list[Image.Image], - min_dynamic_patch: int | None = None, - max_dynamic_patch: int | None = None, - dynamic_image_size: bool | None = None, - ) -> tuple[list[str], dict[str, torch.Tensor]]: - if len(images) == 0: - image_inputs = {} - else: - pixel_values_lst = self._images_to_pixel_values_lst( - images, - min_dynamic_patch=min_dynamic_patch, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, - ) - image_inputs = { - "pixel_values_flat": torch.cat(pixel_values_lst), - "image_num_patches": torch.tensor( - [len(item) for item in pixel_values_lst] - ), - } - - for pixel_values in pixel_values_lst: - num_patches = pixel_values.shape[0] - feature_size = num_patches * self.num_image_token - - image_repl = self.get_image_repl(feature_size, num_patches) - text = [t.replace("", image_repl.full, 1) for t in text] - return text, image_inputs - - def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]: - if input_item is None: - input_item = [] - if not isinstance(input_item, list): - input_item = [input_item] - return input_item - def __call__( self, - text: str | list[str] | None = None, - images: Image.Image | list[Image.Image] | None = None, + images: Image.Image | list[Image.Image], *, min_dynamic_patch: int | None = None, max_dynamic_patch: int | None = None, @@ -423,120 +297,204 @@ def __call__( return_tensors: str | TensorType | None = None, **kwargs, ) -> BatchFeature: - text = self._make_batch_input(text) - images = self._make_batch_input(images) + images_lst = [images] if not isinstance(images, list) else images - text, image_inputs = self._preprocess_image( - text=text, - images=images, + pixel_values_lst = self._images_to_pixel_values_lst( + images_lst, min_dynamic_patch=min_dynamic_patch, max_dynamic_patch=max_dynamic_patch, dynamic_image_size=dynamic_image_size, ) - text_inputs = self.tokenizer(text) + image_inputs = { + "pixel_values_flat": torch.cat(pixel_values_lst), + "image_num_patches": torch.tensor([len(item) for item in pixel_values_lst]), + } + return BatchFeature(image_inputs, tensor_type=return_tensors) - combined_outputs = {**text_inputs, **image_inputs} - return BatchFeature(combined_outputs, tensor_type=return_tensors) +class InternVLVideoProcessor: + def __init__( + self, + image_size: int, + ) -> None: + self.image_size = image_size + + def _videos_to_pixel_values_lst( + self, + videos: list[npt.NDArray], + ) -> list[torch.Tensor]: + return [ + video_to_pixel_values_internvl( + video, + input_size=self.image_size, + min_num=1, + max_num=1, + use_thumbnail=False, + ) + for video in videos + ] + + def __call__( + self, + videos: npt.NDArray | list[npt.NDArray], + *, + return_tensors: str | TensorType | None = None, + **kwargs, + ) -> BatchFeature: + videos_lst = [videos] if not isinstance(videos, list) else videos + + pixel_values_lst = self._videos_to_pixel_values_lst(videos_lst) + + image_inputs = { + "pixel_values_flat_video": torch.cat(pixel_values_lst), + "video_num_patches": torch.tensor([len(item) for item in pixel_values_lst]), + } + return BatchFeature(image_inputs, tensor_type=return_tensors) + + +class InternVLProcessorLike(Protocol): + image_seq_length: int + image_token: str + image_token_id: int + start_image_token: str + start_image_token_id: int + end_image_token: str + end_image_token_id: int + + def resolve_target_ratios(self) -> list[tuple[int, int]]: ... + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: ... + + def get_image_repl( + self, + num_patches: int | None, + num_features: int | None = None, + ) -> PromptUpdateDetails[str]: ... -class InternVLProcessor(BaseInternVLProcessor): +class InternVLProcessor(InternVLProcessorLike, ProcessorMixin): """ - HF Processor for InternVLChatModel with extended video processing logic. + This model doesn't define its own HF processor, + so we implement our own one here. + + The code to insert image tokens is based on: + https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252 Code for video processing is adapted from video example: https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers """ + attributes = ["image_processor", "tokenizer", "video_processor"] + def __init__( self, - config: PretrainedConfig, + image_processor: InternVLImageProcessor, tokenizer: TokenizerLike, + video_processor: InternVLVideoProcessor | None = None, *, - min_dynamic_patch: int | None = None, - max_dynamic_patch: int | None = None, - dynamic_image_size: bool | None = None, + image_seq_length: int, + image_token: str = "", + start_image_token: str = "", + end_image_token: str = "", video_token: str | None = None, ) -> None: - super().__init__( - config=config, - tokenizer=tokenizer, - min_dynamic_patch=min_dynamic_patch, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, - ) - # add extra video token for video processing - self.video_token = video_token + self.image_processor = image_processor + self.tokenizer = tokenizer + self.video_processor = video_processor - @property - def image_token_id(self) -> int: - return self.tokenizer.get_vocab()[IMG_CONTEXT] + self.image_seq_length = image_seq_length + self.image_token = image_token + self.start_image_token = start_image_token + self.end_image_token = end_image_token + self.video_token = video_token - @property - def video_token_id(self) -> int | None: - if self.video_token is None: - return None - return self.tokenizer.get_vocab().get(self.video_token, None) + self.image_token_id = tokenizer.convert_tokens_to_ids(image_token) + self.start_image_token_id = tokenizer.convert_tokens_to_ids(start_image_token) + self.end_image_token_id = tokenizer.convert_tokens_to_ids(end_image_token) + self.video_token_id = ( + None + if video_token is None + else tokenizer.convert_tokens_to_ids(video_token) + ) @property def supports_video(self) -> bool: return self.video_token_id is not None - def _videos_to_pixel_values_lst( + def resolve_target_ratios( self, - videos: list[npt.NDArray], + *, + min_dynamic_patch: int | None = None, + max_dynamic_patch: int | None = None, dynamic_image_size: bool | None = None, - ) -> list[torch.Tensor]: - min_num, max_num = self.resolve_min_max_num( - min_dynamic_patch=1, - max_dynamic_patch=1, + use_thumbnail: bool | None = None, + ) -> list[tuple[int, int]]: + min_num, max_num = self.image_processor.resolve_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, dynamic_image_size=dynamic_image_size, - use_thumbnail=False, # Applied in image_to_pixel_values + use_thumbnail=use_thumbnail, ) - return [ - video_to_pixel_values_internvl( - video, - input_size=self.image_size, - min_num=min_num, - max_num=max_num, - use_thumbnail=False, - ) - for video in videos - ] + return get_internvl_target_ratios(min_num, max_num) - def _preprocess_video( + def get_num_image_tokens( self, - text: list[str], - videos: list[npt.NDArray], - dynamic_image_size: bool | None = None, - ) -> tuple[list[str], dict[str, Any]]: - if len(videos) == 0 or not self.supports_video: - return text, {} - - video_token = self.video_token - assert video_token is not None + *, + image_width: int, + image_height: int, + ) -> int: + image_processor = self.image_processor + target_ratios = self.resolve_target_ratios( + use_thumbnail=False, # Applied in calculate_targets + ) - pixel_values_lst_video = self._videos_to_pixel_values_lst( - videos, - dynamic_image_size=dynamic_image_size, + num_patches, _, _ = calculate_internvl_targets( + orig_width=image_width, + orig_height=image_height, + image_size=image_processor.image_size, + target_ratios=target_ratios, + use_thumbnail=image_processor.use_thumbnail, ) - video_inputs = { - "pixel_values_flat_video": torch.cat(pixel_values_lst_video), - "video_num_patches": torch.tensor( - [len(item) for item in pixel_values_lst_video] - ), - } - for pixel_values in pixel_values_lst_video: - num_patches = pixel_values.shape[0] + return num_patches * self.image_seq_length - video_repl = self.get_video_repl( - self.num_image_token, num_patches, video_token - ) - text = [t.replace("