diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 4476009fd271..b4ad610cdafb 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -376,9 +376,9 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
 
     engine_args = EngineArgs(
         model="moonshotai/Kimi-VL-A3B-Instruct",
-        max_model_len=4096,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
         trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={"image": 1},
     )
 
     return ModelRequestData(
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 52e938967066..e2e14d16228a 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -331,11 +331,10 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
 
     engine_args = EngineArgs(
         model=model_name,
+        trust_remote_code=True,
         max_model_len=4096,
         max_num_seqs=4,
-        tensor_parallel_size=1,
         limit_mm_per_prompt={"image": len(image_urls)},
-        trust_remote_code=True,
     )
 
     placeholders = [{"type": "image", "image": url} for url in image_urls]
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index c2fac70afc49..1520f6992f0a 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -56,7 +56,6 @@
 from vllm.config import VllmConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
-from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -70,22 +69,20 @@
 from vllm.model_executor.models.utils import merge_multimodal_embeddings
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
-                                    NestedTensors)
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
                                         PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
 from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
 
 from .utils import is_pp_missing_parameter, maybe_prefix
 
-logger = init_logger(__name__)
-
 
 # For dummy input only
 @dataclass
@@ -143,6 +140,9 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
     def get_hf_config(self):
         return self.ctx.get_hf_config(KimiVLConfig)
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
     def get_num_image_tokens(
         self,
         *,
@@ -180,23 +180,6 @@ def get_num_image_tokens(
         token_width = (width + pad_width) // (kernel_size[1] * patch_size)
         return int(token_height * token_width)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        # None means unlimited
-        return {"image": None}
-
-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {
-            "image":
-            self.get_num_image_tokens(
-                image_width=MaxImageTokenMeta.width,
-                image_height=MaxImageTokenMeta.height,
-            ),
-        }
-
     @property
     def image_token_id(self) -> int:
         return self.get_hf_config().media_placeholder_token_id
@@ -204,34 +187,28 @@ def image_token_id(self) -> int:
 
 
 class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
 
-    def __init__(self, info: KimiVLProcessingInfo) -> None:
-        super().__init__(info)
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        processor = self.info.get_hf_processor()
+        image_token = processor.image_token
 
-        self.image_token_id = self.info.image_token_id
-        self.image_token = self.info.get_tokenizer().decode(
-            self.image_token_id)
+        return image_token * num_images
 
-    def get_dummy_processor_inputs(
+    def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
-        width = MaxImageTokenMeta.width
-        height = MaxImageTokenMeta.height
-        mm_data = {
+        return {
             "image":
-            self._get_dummy_images(width=width,
-                                   height=height,
+            self._get_dummy_images(width=MaxImageTokenMeta.width,
+                                   height=MaxImageTokenMeta.height,
                                    num_images=num_images)
         }
 
-        return ProcessorInputs(
-            prompt_text=self.image_token * num_images,
-            mm_data=mm_data,
-        )
-
 
 class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
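
Usage note (not part of the patch): a minimal sketch of driving Kimi-VL offline with the engine arguments from the updated examples/offline_inference/vision_language.py. The prompt string and image path below are placeholders, not values from this PR; the real example builds the prompt from the model's chat template.

from PIL import Image

from vllm import LLM, SamplingParams

# Same engine arguments as the updated run_kimi_vl() example:
# trust_remote_code is needed for the Kimi-VL config/processor, and
# limit_mm_per_prompt caps images per request now that the processing
# info reports no hard per-item limit.
llm = LLM(
    model="moonshotai/Kimi-VL-A3B-Instruct",
    trust_remote_code=True,
    max_model_len=4096,
    limit_mm_per_prompt={"image": 1},
)

# "prompt" must follow the model's chat template, including its image
# placeholder token; a stand-in string and image file are used here.
outputs = llm.generate(
    {
        "prompt": "<chat-formatted prompt containing the image placeholder>",
        "multi_modal_data": {"image": Image.open("example.jpg")},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)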