@@ -56,7 +56,6 @@
 from vllm.config import VllmConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
-from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -70,22 +69,20 @@
 from vllm.model_executor.models.utils import merge_multimodal_embeddings
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
-                                    NestedTensors)
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
                                         PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
 from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
 
 from .utils import is_pp_missing_parameter, maybe_prefix
 
-logger = init_logger(__name__)
-
 
 # For dummy input only
 @dataclass
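
The hunk above swaps ProcessorInputs for MultiModalDataDict in the imports. As a rough sketch of what that type carries, it maps a modality name to that modality's data items; the PIL item type and the image size here are assumptions for illustration only:

    # Illustrative only: a MultiModalDataDict-style mapping from modality
    # name to data items; here, two blank RGB images under "image".
    from PIL import Image

    dummy_mm_data = {
        "image": [Image.new("RGB", (64, 64)) for _ in range(2)],
    }
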
@@ -143,6 +140,9 @@ class KimiVLProcessingInfo(BaseProcessingInfo): |
     def get_hf_config(self):
         return self.ctx.get_hf_config(KimiVLConfig)
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
     def get_num_image_tokens(
         self,
         *,
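
Returning None as a limit marks a modality as unbounded. A minimal sketch of this contract, where the bounded "video" entry is purely an assumed illustration and not part of Kimi-VL:

    # Sketch, not from this PR: a ProcessingInfo-style class bounding the
    # number of multimodal items per prompt. A concrete int caps the count;
    # None leaves it unlimited, which is what Kimi-VL chooses for images.
    from collections.abc import Mapping
    from typing import Optional

    class ExampleInfo:
        def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
            return {"image": None, "video": 1}
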
@@ -180,58 +180,35 @@ def get_num_image_tokens( |
         token_width = (width + pad_width) // (kernel_size[1] * patch_size)
         return int(token_height * token_width)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        # None means unlimited
-        return {"image": None}
-
-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {
-            "image":
-            self.get_num_image_tokens(
-                image_width=MaxImageTokenMeta.width,
-                image_height=MaxImageTokenMeta.height,
-            ),
-        }
-
     @property
     def image_token_id(self) -> int:
         return self.get_hf_config().media_placeholder_token_id
 
 
 class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
 
-    def __init__(self, info: KimiVLProcessingInfo) -> None:
-        super().__init__(info)
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        processor = self.info.get_hf_processor()
+        image_token = processor.image_token
 
-        self.image_token_id = self.info.image_token_id
-        self.image_token = self.info.get_tokenizer().decode(
-            self.image_token_id)
+        return image_token * num_images
 
-    def get_dummy_processor_inputs(
+    def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
-        width = MaxImageTokenMeta.width
-        height = MaxImageTokenMeta.height
-        mm_data = {
+        return {
             "image":
-            self._get_dummy_images(width=width,
-                                   height=height,
+            self._get_dummy_images(width=MaxImageTokenMeta.width,
+                                   height=MaxImageTokenMeta.height,
                                    num_images=num_images)
         }
 
-        return ProcessorInputs(
-            prompt_text=self.image_token * num_images,
-            mm_data=mm_data,
-        )
-
 
 class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
 
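
The builder hunk above replaces the single get_dummy_processor_inputs entry point with two hooks: get_dummy_text builds the placeholder prompt and get_dummy_mm_data builds the profiling images. A self-contained sketch of that split, assuming a generic "<image>" placeholder, a stand-in base class, and a hypothetical make_image helper in place of vLLM's _get_dummy_images:

    # Standalone sketch of the text/data split adopted above; the names
    # marked below are assumptions for illustration, not vLLM API.
    from collections.abc import Mapping

    from PIL import Image

    def make_image(width: int, height: int) -> Image.Image:
        # Blank RGB canvas standing in for the real _get_dummy_images helper.
        return Image.new("RGB", (width, height))

    class SketchDummyInputsBuilder:  # stand-in for BaseDummyInputsBuilder
        IMAGE_TOKEN = "<image>"  # assumed; the real code reads the token
                                 # off the HF processor (processor.image_token)

        def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
            # One placeholder per requested image; the processor later
            # expands each placeholder to that image's token count.
            return self.IMAGE_TOKEN * mm_counts.get("image", 0)

        def get_dummy_mm_data(self, seq_len: int,
                              mm_counts: Mapping[str, int]) -> dict:
            # Profiling-sized dummy images; MaxImageTokenMeta.width/height
            # play this role in the real code (336 is an assumed size).
            num_images = mm_counts.get("image", 0)
            return {"image": [make_image(336, 336) for _ in range(num_images)]}
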