diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 7c64d14ca9d7..f34a6d36d750 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -44,6 +44,10 @@
     maybe_prefix,
 )
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.evs import (
+    compute_retained_tokens_count,
+    compute_retention_mask,
+)
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
     MultiModalFieldConfig,
@@ -62,13 +66,20 @@
     PromptReplacement,
     PromptUpdate,
     PromptUpdateDetails,
+    _seq2tokens,
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.radio import RadioConfig
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.transformers_utils.tokenizer import (
+    AnyTokenizer,
+    cached_tokenizer_from_config,
+    encode_tokens,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from .utils import _merge_multimodal_embeddings
+
 
 # Configure PIL to handle large images without warnings
 # This prevents DecompressionBombWarning for legitimate large images
 Image.MAX_IMAGE_PIXELS = None  # Disable the limit entirely
@@ -382,6 +393,7 @@ def __init__(
         max_dynamic_patch: Optional[int] = None,
         dynamic_image_size: Optional[bool] = None,
         video_token: Optional[str] = None,
+        video_pruning_rate: Optional[float] = None,
     ) -> None:
         super().__init__(
             config=config,
@@ -392,6 +404,7 @@
         )
         # add extra video token for video processing
         self.video_token = video_token
+        self.video_pruning_rate = video_pruning_rate
 
     @property
     def supports_video(self) -> bool:
@@ -446,12 +459,38 @@ def _preprocess_video(
                 ),
             }
 
+        image_size: int = self.config.force_image_size
+        patch_size: int = self.config.patch_size
+        downsample_ratio = self.config.downsample_ratio
+        tokens_per_frame = int(
+            (image_size * image_size // patch_size**2) * (downsample_ratio**2)
+        )
+
         for pixel_values in pixel_values_lst_video:
-            num_patches = pixel_values.shape[0]
+            num_frames = pixel_values.shape[0]
+
+            if (
+                self.video_pruning_rate is not None
+                and self.video_pruning_rate > 0.0
+            ):
+                # Start of EVS-specific code
+                num_tokens = compute_retained_tokens_count(
+                    tokens_per_frame=tokens_per_frame,
+                    num_frames=num_frames,
+                    q=self.video_pruning_rate,
+                )
+
+                # Here we just need placeholders that won't actually be replaced -
+                # we just need to make sure the total number of tokens is correct
+                # assign all tokens to the first frame
+                tokens_per_frame = [num_tokens] + [0] * (num_frames - 1)
+
+                # End of EVS-specific code
+            else:
+                tokens_per_frame = [tokens_per_frame] * num_frames
+
+            video_repl = self.get_video_repl(tokens_per_frame, self.video_token)
 
-            video_repl = self.get_video_repl(
-                self.num_image_token, num_patches, self.video_token
-            )
             text = [t.replace("