From a3bd4e2c89c261b50b44919066eccb92d43c6ee7 Mon Sep 17 00:00:00 2001
From: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Date: Wed, 8 Oct 2025 11:32:09 +0300
Subject: [PATCH 01/14] Allow passing "mm_processor_kwargs":
 dict(max_num_tiles=2),

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
---
 .../model_executor/models/nano_nemotron_vl.py | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 039ffbddf8db..7c6b82a4b313 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -93,6 +93,7 @@
 
 # Profiling
 MAX_FRAMES = 16
+DEFAULT_NUM_TILES = 12
 
 
 class NanoNemotronVLImagePixelInputs(TypedDict):
@@ -255,13 +256,19 @@ class BaseNanoNemotronVLProcessor(ABC):
     """
 
     def __init__(
-        self, config: PretrainedConfig, tokenizer: AnyTokenizer, *args, **kwargs
+        self,
+        config: PretrainedConfig,
+        tokenizer: AnyTokenizer,
+        *args,
+        max_num_tiles: Optional[int] = None,
+        **kwargs,
     ) -> None:
         super().__init__()
 
         self.config = config
         self.tokenizer = tokenizer
 
+        self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES
         image_size: int = config.force_image_size
         patch_size: int = config.patch_size
 
@@ -361,7 +368,7 @@ def __call__(
     ) -> BatchFeature:
         # Use default if not provided
         if max_num_tiles is None:
-            max_num_tiles = 12
+            max_num_tiles = self.max_num_tiles
 
         text, images = [self._make_batch_input(x) for x in (text, images)]
 
@@ -390,6 +397,7 @@ def __init__(
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
+        max_num_tiles: Optional[int] = None,
         min_dynamic_patch: Optional[int] = None,
         max_dynamic_patch: Optional[int] = None,
         dynamic_image_size: Optional[bool] = None,
@@ -399,6 +407,7 @@ def __init__(
         super().__init__(
             config=config,
             tokenizer=tokenizer,
+            max_num_tiles=max_num_tiles,
             min_dynamic_patch=min_dynamic_patch,
             max_dynamic_patch=max_dynamic_patch,
             dynamic_image_size=dynamic_image_size,
@@ -506,7 +515,7 @@ def __call__(
     ) -> BatchFeature:
         # Use default if not provided
         if max_num_tiles is None:
-            max_num_tiles = 12
+            max_num_tiles = self.max_num_tiles
 
         text, images, videos = [
             self._make_batch_input(x) for x in (text, images, videos)
@@ -635,7 +644,7 @@ def get_image_size_with_most_features(self, max_num_tiles: int) -> ImageSize:
     def get_max_image_tokens(self) -> int:
         processor = self.get_hf_processor()
         # Use default max_num_tiles for max tokens calculation
-        max_num_tiles = 12
+        max_num_tiles = processor.max_num_tiles
         target_width, target_height = self.get_image_size_with_most_features(
             max_num_tiles
         )
@@ -768,7 +777,9 @@ def get_replacement_custom(item_idx: int):
             else:
                 image_size = images.get_image_size(item_idx)
                 # Extract max_num_tiles from kwargs, default to 12
-                max_num_tiles = hf_processor_mm_kwargs.get("max_num_tiles", 12)
+                max_num_tiles = hf_processor_mm_kwargs.get(
+                    "max_num_tiles", hf_processor.max_num_tiles
+                )
                 feature_size = self.info.get_num_image_tokens(
                     image_width=image_size.width,
                     image_height=image_size.height,

From 16568945e0635f06580ae69554ac095b9f008fea Mon Sep 17 00:00:00 2001
From: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Date: Wed, 8 Oct 2025 11:53:39 +0300
Subject: [PATCH 02/14] Ensure video modality always uses 1 tile (performance
 optimization)

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
---
 vllm/model_executor/models/nano_nemotron_vl.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 7c6b82a4b313..91dfa6735534 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -228,6 +228,8 @@ def video_to_pixel_values(
     max_num_tiles: int = 1,
     use_thumbnail: bool,
 ) -> torch.Tensor:
+    assert max_num_tiles == 1, "Video modality always uses one tile"
+
     # Convert each frame to a single resized tile tensor consistent
     # with image path
     frames_tensors: list[torch.Tensor] = []
@@ -530,7 +532,7 @@ def __call__(
         text, video_inputs = self._preprocess_video(
             text=text,
             videos=videos,
-            max_num_tiles=max_num_tiles,
+            max_num_tiles=1,
             dynamic_image_size=dynamic_image_size,
         )
 

From e8562b8d2b94ec18e8004d7a29890e1193ed7d46 Mon Sep 17 00:00:00 2001
From: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Date: Sun, 12 Oct 2025 20:04:23 +0300
Subject: [PATCH 03/14] Cherry pick video loading support

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
---
 vllm/multimodal/video.py | 72 ++++++++++++++++++++++++----------------
 1 file changed, 44 insertions(+), 28 deletions(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 400d6a6be9be..7e5ba917be71 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -6,7 +6,7 @@
 from functools import partial
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Union
+from typing import Any
 
 import numpy as np
 import numpy.typing as npt
@@ -175,6 +175,16 @@ def load_bytes(
         max_duration: int = 300,
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
+        """
+        Args:
+            num_frames (int): Maximum number of frames to load.
+            The total number of sampled frames will never be larger
+            than this value. Set it to -1 to remove the upper limit.
+
+            fps (int): Desired video sampling rate. The actual sampling
+            rate may be lower if the video is long and the
+            num_frames upper limit is set to a positive value.
+        """
         import cv2
 
         backend = cls().get_cv2_video_api()
@@ -183,36 +193,42 @@ def load_bytes(
             raise ValueError("Could not open video stream")
 
         total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        original_fps = cap.get(cv2.CAP_PROP_FPS)
-        duration = total_frames_num / original_fps if original_fps > 0 else 0
+        if total_frames_num == 0:
+            raise ValueError("CAP_PROP_FRAME_COUNT returned 0")
 
-        # resample video to target num_frames
-        max_frame_idx = total_frames_num - 1
-        duration = duration or round(max_frame_idx / original_fps) + 1
-
-        # Refer to:
-        # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
-        frame_indices: Union[range, list[int]]
-        if duration <= max_duration:
-            n = int(math.floor(duration * fps))
-            frame_indices = sorted(
-                {
-                    min(max_frame_idx, int(math.ceil(i * original_fps / fps)))
-                    for i in range(n)
-                }
+        original_fps = cap.get(cv2.CAP_PROP_FPS)
+        if not (original_fps > 0):
+            print(
+                f"WARNING: CAP_PROP_FPS returned {original_fps}. "
+                f"We will use 30 FPS as default fallback."
             )
+            original_fps = 30
+
+        duration = total_frames_num / original_fps
+
+        # Determine target number of samples
+        if num_frames > 0:
+            # Hard upper bound
+            max_samples = int(num_frames)
         else:
-            num_samples = int(max_duration * fps)
-            if num_samples >= total_frames_num:
-                frame_indices = range(total_frames_num)
-            else:
-                target_seconds = np.linspace(0, duration, num_samples, endpoint=True)
-                frame_indices = sorted(
-                    {
-                        min(max_frame_idx, int(math.ceil(t * original_fps)))
-                        for t in target_seconds
-                    }
-                )
+            # No cap -> sample at desired fps
+            max_samples = int(max(1, math.floor(duration * fps)))
+
+        # Clamp to available frames if count is known
+        max_samples = max(1, min(max_samples, total_frames_num))
+
+        # Uniform coverage of the entire timeline within the cap
+        # Use linspace over [0, total_frames-1]
+        raw = np.linspace(0, total_frames_num - 1, max_samples, endpoint=True)
+        frame_indices = np.unique(raw.round().astype(int)).tolist()
+
+        effective_fps = len(frame_indices) / duration
+        print(
+            f"Video [{total_frames_num} fames]({duration:.2f}sec "
+            f"at {original_fps:.2f}fps) sampled "
+            f"into frame [{len(frame_indices)}] indexes {frame_indices} "
+            f"at {effective_fps:.2f}fps."
+        )
 
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

From 5809b49ed2751ccbcd0950aca3b9fc2632735b0f Mon Sep 17 00:00:00 2001
From: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Date: Sat, 11 Oct 2025 23:32:52 +0300
Subject: [PATCH 04/14] Cherry-pick image normalization

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
---
 .../model_executor/models/nano_nemotron_vl.py | 12 +++++-
 vllm/model_executor/models/radio.py           | 38 +++----------------
 2 files changed, 15 insertions(+), 35 deletions(-)

diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 91dfa6735534..dfe77ee22fda 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -248,6 +248,10 @@ def video_to_pixel_values(
     return torch.stack(frames_tensors)
 
 
+def input_conditioner(x, norm_mean, norm_std):
+    return (x - norm_mean) / norm_std
+
+
 class BaseNanoNemotronVLProcessor(ABC):
     """
     This model doesn't define its own HF processor,
@@ -341,7 +345,9 @@ def _preprocess_image(
         else:
             pixel_values_lst = self._images_to_pixel_values_lst(images, max_num_tiles)
             image_inputs = {
-                "pixel_values_flat": torch.cat(pixel_values_lst),
+                "pixel_values_flat": input_conditioner(
+                    torch.cat(pixel_values_lst), self.norm_mean, self.norm_std
+                ),
                 "image_num_patches": torch.tensor(
                     [len(item) for item in pixel_values_lst]
                 ),
@@ -465,7 +471,9 @@ def _preprocess_video(
             )
 
             video_inputs = {
-                "pixel_values_flat_video": torch.cat(pixel_values_lst_video),
+                "pixel_values_flat_video": input_conditioner(
+                    torch.cat(pixel_values_lst_video), self.norm_mean, self.norm_std
+                ),
                 "video_num_patches": torch.tensor(
                     [len(item) for item in pixel_values_lst_video]
                 ),
diff --git a/vllm/model_executor/models/radio.py b/vllm/model_executor/models/radio.py
index 2313b98348b7..03d56eead6e0 100644
--- a/vllm/model_executor/models/radio.py
+++ b/vllm/model_executor/models/radio.py
@@ -43,32 +43,6 @@ def parse(x):
 to_ntuple = _ntuple
 
 
-class InputConditioner(nn.Module):
-    def __init__(
-        self,
-        input_scale: float,
-        norm_mean: norm_t,
-        norm_std: norm_t,
-        dtype: torch.dtype = None,
-    ):
-        super().__init__()
-
-        self.dtype = dtype
-
-        self.register_buffer("norm_mean", _to_tensor(norm_mean) / input_scale)
-        self.register_buffer("norm_std", _to_tensor(norm_std) / input_scale)
-
-    def forward(self, x: torch.Tensor):
-        y = (x - self.norm_mean) / self.norm_std
-        if self.dtype is not None:
-            y = y.to(self.dtype)
-        return y
-
-
-def _to_tensor(v: norm_t):
-    return torch.as_tensor(v, dtype=torch.float32).view(-1, 1, 1)
-
-
 class ClsToken(nn.Module):
     def __init__(
         self,
@@ -507,11 +481,6 @@ def __init__(
         super().__init__()
 
         self.config = config
-        self.input_conditioner = InputConditioner(
-            input_scale=1.0,
-            norm_mean=config.norm_mean,
-            norm_std=config.norm_std,
-        )
         self.model = RadioInternVisionModel(
             config=config,
             quant_config=quant_config,
@@ -525,8 +494,7 @@ def forward(
         pixel_values: Optional[torch.Tensor] = None,
         pixel_embeds: Optional[torch.Tensor] = None,
     ) -> torch.FloatTensor:
-        x = self.input_conditioner(pixel_values)
-        y = self.model(x)
+        y = self.model(pixel_values)
         return self._extract_final(y)
 
     def load_weights(self, weights) -> set[str]:
@@ -548,6 +516,10 @@ def load_weights(self, weights) -> set[str]:
             # Skip buffers not used in vLLM
             if sub in {"summary_idxs"}:
                 continue
+            if sub.startswith("input_conditioner."):
+                # we normalize in the input processor,
+                # based on norm and std values from the config
+                continue
 
             vllm_key = None
             if sub.startswith("model.patch_generator."):

From 44f364be8f869ccabbf3349baa04343c097f0b57 Mon Sep 17 00:00:00 2001
From: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Date: Sun, 12 Oct 2025 20:04:23 +0300
Subject: [PATCH 05/14] Add video timestamps support

Fix image normalization precision bug
Fix possible inconsistent tokenization of frame prefixes
Change logic of video sampling in OpenCVDynamicBackend

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
---
 .../model_executor/models/nano_nemotron_vl.py | 283 +++++++++++++++---
 vllm/model_executor/models/radio.py           |  38 +--
 vllm/model_executor/models/utils.py           |   1 -
 vllm/multimodal/video.py                      |  72 +++--
 4 files changed, 293 insertions(+), 101 deletions(-)

diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 91dfa6735534..e9d757a78e04 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -14,6 +14,7 @@
 from typing import Annotated, Any, Literal, Optional, TypedDict, TypeVar, Union
 
 import numpy.typing as npt
+import regex as re
 import torch
 import torch.nn as nn
 import torchvision.transforms as T
@@ -21,7 +22,7 @@
 from transformers import BatchFeature, PretrainedConfig, TensorType
 
 from vllm.config import VllmConfig
-from vllm.config.multimodal import BaseDummyOptions
+from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.model_executor.layers.activation import ReLUSquaredActivation
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -54,12 +55,14 @@
     MultiModalFieldConfig,
     MultiModalKwargs,
     MultiModalKwargsItems,
+    VideoItem,
 )
 from vllm.multimodal.parse import (
     ImageEmbeddingItems,
     ImageProcessorItems,
     ImageSize,
     MultiModalDataItems,
+    MultiModalDataParser,
 )
 from vllm.multimodal.processing import (
     BaseMultiModalProcessor,
@@ -128,7 +131,8 @@ class NanoNemotronVLVideoPixelInputs(TensorSchema):
     """
     Dimensions:
         - bvf: Batch size * number of videos * num_frames
-        - bn: Batch size * number of images
+        - bn: Batch size * number of videos
+        - f: Number of frames
         - c: Number of channels (3)
         - h: Height of each video frame
         - w: Width of each video frame
@@ -137,6 +141,8 @@ class NanoNemotronVLVideoPixelInputs(TensorSchema):
     type: Literal["pixel_values_videos"]
     pixel_values_flat: Annotated[torch.Tensor, TensorShape("bvf", 3, "h", "w")]
     num_patches: Annotated[torch.Tensor, TensorShape("bn")]
+    frames_indices: Annotated[torch.Tensor, TensorShape("bvf")]
+    frame_duration_ms: Annotated[torch.Tensor, TensorShape("bn")]
 
 
 class NanoNemotronVLVideoEmbeddingInputs(TensorSchema):
@@ -248,6 +254,21 @@ def video_to_pixel_values(
     return torch.stack(frames_tensors)
 
 
+def input_conditioner(x, norm_mean, norm_std):
+    return (x - norm_mean) / norm_std
+
+
+def calculate_timestamps(
+    indices: list[int] | torch.Tensor,
+    frame_duration_ms: int,
+):
+    if not isinstance(indices, list):
+        indices = indices.tolist()
+
+    timestamps = [int(i) * frame_duration_ms / 1000.0 for i in indices]
+    return timestamps
+
+
 class BaseNanoNemotronVLProcessor(ABC):
     """
     This model doesn't define its own HF processor,
@@ -341,17 +362,30 @@ def _preprocess_image(
         else:
             pixel_values_lst = self._images_to_pixel_values_lst(images, max_num_tiles)
             image_inputs = {
-                "pixel_values_flat": torch.cat(pixel_values_lst),
+                "pixel_values_flat": input_conditioner(
+                    torch.cat(pixel_values_lst), self.norm_mean, self.norm_std
+                ),
                 "image_num_patches": torch.tensor(
                     [len(item) for item in pixel_values_lst]
                 ),
             }
 
-            for pixel_values in pixel_values_lst:
+            assert len(text) == 1, (
+                "hf_processor is called on the output of get_dummy_text, "
+                "which should be a single string"
+            )
+            parts = [x for x in re.split(r"(<image>)", text[0]) if x]
+            assert parts.count("<image>") == len(pixel_values_lst), (
+                "the number of <image> tokens in the text should be the "
+                "same as the number of images"
+            )
+
+            for i, pixel_values in enumerate(pixel_values_lst):
                 num_patches = pixel_values.shape[0]
                 feature_size = num_patches * self.num_image_token
                 image_repl = self.get_image_repl(feature_size, num_patches)
-                text = [t.replace("<image>", image_repl.full, 1) for t in text]
+                parts[i] = parts[i].replace("<image>", image_repl.full)
+            text = ["".join(parts)]
         return text, image_inputs
 
     def _make_batch_input(self, input_item: Optional[Union[Any, list[Any]]] = None):
@@ -418,6 +452,18 @@ def __init__(
         self.video_token = video_token
         self.video_pruning_rate = video_pruning_rate
 
+        # Pre-tokenize special tokens for video processing
+        # to avoid repeated tokenization
+        self._img_start_token_ids = encode_tokens(
+            tokenizer, IMG_START, add_special_tokens=False
+        )
+        self._img_end_token_ids = encode_tokens(
+            tokenizer, IMG_END, add_special_tokens=False
+        )
+        self._img_context_token_ids = encode_tokens(
+            tokenizer, IMG_CONTEXT, add_special_tokens=False
+        )
+
     @property
     def supports_video(self) -> bool:
         return self.video_token_id is not None
@@ -451,24 +497,43 @@ def _videos_to_pixel_values_lst(
     def _preprocess_video(
         self,
         text: list[str],
-        videos: list[npt.NDArray],
+        videos: list[tuple[npt.NDArray, dict[str, Any]]],
         max_num_tiles: int,
         dynamic_image_size: Optional[bool] = None,
     ):
         if len(videos) == 0 or not self.supports_video:
             video_inputs = {}
         else:
+            videos_lst = [v[0] for v in videos]
+            video_metadata_lst = [v[1] for v in videos]
             pixel_values_lst_video = self._videos_to_pixel_values_lst(
-                videos,
+                videos_lst,
                 max_num_tiles=max_num_tiles,
                 dynamic_image_size=dynamic_image_size,
             )
 
+            # We use frame duration in milliseconds (as integer) to ensure
+            # we have consistent timestamps calculation. At preprocessing
+            # fps parameter is given in fp32, while at inference it is bf16
+            # which leads to inaccurate timestamp calculation and causes
+            # timestamp values to differ. In rare cases this causes
+            # mismatching number of output tokens for tokenized frame prefixes
+            frame_duration_ms_lst = [
+                int(1000.0 / metadata["fps"]) for metadata in video_metadata_lst
+            ]
+            frames_indices_lst = [
+                metadata["frames_indices"] for metadata in video_metadata_lst
+            ]
+
             video_inputs = {
-                "pixel_values_flat_video": torch.cat(pixel_values_lst_video),
+                "pixel_values_flat_video": input_conditioner(
+                    torch.cat(pixel_values_lst_video), self.norm_mean, self.norm_std
+                ),
                 "video_num_patches": torch.tensor(
                     [len(item) for item in pixel_values_lst_video]
                 ),
+                "frames_indices": frames_indices_lst,
+                "frame_duration_ms": torch.tensor(frame_duration_ms_lst),
             }
 
             image_size: int = self.config.force_image_size
@@ -478,7 +543,12 @@ def _preprocess_video(
                 (image_size * image_size // patch_size**2) * (downsample_ratio**2)
             )
 
-            for pixel_values in pixel_values_lst_video:
+            for pixel_values, video_metadata, frames_indices, frame_duration_ms in zip(
+                pixel_values_lst_video,
+                video_metadata_lst,
+                frames_indices_lst,
+                frame_duration_ms_lst,
+            ):
                 num_frames = pixel_values.shape[0]
 
                 if (
@@ -501,16 +571,29 @@ def _preprocess_video(
                 else:
                     tokens_per_frame = [tokens_in_single_frame] * num_frames
 
-                video_repl = self.get_video_repl(tokens_per_frame, self.video_token)
+                video_repl = self.get_video_repl(
+                    tokens_per_frame=tokens_per_frame,
+                    frames_indices=frames_indices,
+                    frame_duration_ms=frame_duration_ms,
+                    tokenizer=self.tokenizer,
+                    img_start_token_ids=self._img_start_token_ids,
+                    img_end_token_ids=self._img_end_token_ids,
+                    img_context_token_ids=self._img_context_token_ids,
+                )
 
-                text = [t.replace("<video>", video_repl.full, 1) for t in text]
+                # video_repl.full is a list of token IDs
+                # Convert token IDs back to text for the HF processor flow
+                video_repl_text = self.tokenizer.decode(
+                    video_repl.full, skip_special_tokens=False
+                )
+                text = [t.replace("<video>", video_repl_text, 1) for t in text]
         return text, video_inputs
 
     def __call__(
         self,
         text: Optional[Union[str, list[str]]] = None,
         images: Optional[Union[Image.Image, list[Image.Image]]] = None,
-        videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None,
+        videos: Optional[list[tuple[npt.NDArray, dict[str, Any]]]] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         max_num_tiles: Optional[int] = None,
         dynamic_image_size: Optional[bool] = None,
@@ -555,9 +638,15 @@ def get_image_repl(
     @classmethod
     def get_video_repl(
         cls,
+        *,
         tokens_per_frame: list[int],
-        video_context_token: str = IMG_CONTEXT,
-    ) -> PromptUpdateDetails[str]:
+        frames_indices: list[int],
+        frame_duration_ms: int,
+        tokenizer: AnyTokenizer,
+        img_start_token_ids: list[int],
+        img_end_token_ids: list[int],
+        img_context_token_ids: list[int],
+    ) -> PromptUpdateDetails[list[int]]:
         """
         Build prompt replacement for a video.
         The replacement returned is not actually used to replace the placeholder
@@ -576,16 +665,52 @@ def get_video_repl(
         - EVS real (called from get_real_video_repl_for_evs) - different value per frame
         Args:
             tokens_per_frame (list[int]): number of tokens per frame
-            video_context_token (str): the token to use for the video context
+            frames_indices (list[int]): frame indices
+            frame_duration_ms (int): duration of each frame in milliseconds
+            tokenizer (AnyTokenizer): tokenizer to use for tokenizing frame separators
+            img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
+            img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
+            img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
         """
-        repl_full = "".join(
-            [
-                f"Frame{i + 1}: {IMG_START}{video_context_token * num_tokens}{IMG_END}"
-                for i, num_tokens in enumerate(tokens_per_frame)
+        # TODO: Support frame_duration_ms being None.
+        # At the preprocessing step we should allow absent metadata or
+        # metadata without a frames_indices field.
+        timestamps_enabled = frame_duration_ms is not None
+
+        if timestamps_enabled:
+            timestamps = calculate_timestamps(frames_indices, frame_duration_ms)
+
+            assert len(timestamps) == len(tokens_per_frame), (
+                "timestamps and tokens_per_frame must have the same length"
+            )
+            frame_separators = [
+                f"Frame {i + 1} sampled at {timestamp:.2f} seconds: "
+                for i, timestamp in enumerate(timestamps)
             ]
-        )
+        else:
+            frame_separators = [
+                f"Frame {i + 1}: " for i, _ in enumerate(tokens_per_frame)
+            ]
+
+        # Tokenize frame separator independently
+        frame_separators_tokenized = [
+            _seq2tokens(tokenizer, sep) for sep in frame_separators
+        ]
+
+        # Tokenize each component independently to avoid tokenizer merging tokens
+        # across boundaries. This ensures consistent tokenization regardless of
+        # num_tokens_per_frame values.
+        all_token_ids = []
+        for i, num_tokens in enumerate(tokens_per_frame):
+            frame_sep_token_ids = frame_separators_tokenized[i]
+            all_token_ids.extend(frame_sep_token_ids)
+
+            # Add pre-tokenized special tokens
+            all_token_ids.extend(img_start_token_ids)
+            all_token_ids.extend(img_context_token_ids * num_tokens)
+            all_token_ids.extend(img_end_token_ids)
 
-        return PromptUpdateDetails.from_seq(repl_full)
+        return PromptUpdateDetails.from_seq(all_token_ids)
 
 
 class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo):
@@ -814,6 +939,9 @@ class NanoNemotronVLMultiModalProcessor(
 ):
     """MultiModalProcessor extended for video support"""
 
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return MultiModalDataParser(video_needs_metadata=True)
+
     def _call_hf_processor(
         self,
         prompt: str,
@@ -848,6 +976,8 @@ def _get_mm_fields_config(
                 ),
                 video_num_patches=MultiModalFieldConfig.batched("video"),
                 video_token_id=MultiModalFieldConfig.shared("video", num_videos),
+                frames_indices=MultiModalFieldConfig.batched("video"),
+                frame_duration_ms=MultiModalFieldConfig.batched("video"),
             )
         else:
             video_fields = {}
@@ -878,6 +1008,7 @@ def _get_prompt_updates(
 
         def get_video_replacement_internvl(item_idx: int):
             feature_size = hf_processor.num_image_token
+            video, metadata = mm_items["video"][item_idx]
             num_patches = video_num_patches[item_idx]
             if num_patches is not None:
                 assert isinstance(num_patches, int)
@@ -899,9 +1030,15 @@ def get_video_replacement_internvl(item_idx: int):
             else:
                 tokens_per_frame = [feature_size] * num_patches
 
+            frame_duration_ms = int(1000 / metadata["fps"])
             return hf_processor.get_video_repl(
-                tokens_per_frame,
-                video_context_token=hf_processor.video_token,
+                tokens_per_frame=tokens_per_frame,
+                frames_indices=metadata["frames_indices"],
+                frame_duration_ms=frame_duration_ms,
+                tokenizer=hf_processor.tokenizer,
+                img_start_token_ids=hf_processor._img_start_token_ids,
+                img_end_token_ids=hf_processor._img_end_token_ids,
+                img_context_token_ids=hf_processor._img_context_token_ids,
             )
 
         if self.info.supports_video:
@@ -960,6 +1097,37 @@ def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
 
         return super().get_dummy_text(mm_counts) + "<video>" * num_videos
 
+    def _get_dummy_videos(
+        self,
+        *,
+        width: int,
+        height: int,
+        num_frames: int,
+        num_videos: int,
+        overrides: Optional[VideoDummyOptions] = None,
+    ) -> list[VideoItem]:
+        video = super()._get_dummy_videos(
+            width=width,
+            height=height,
+            num_frames=num_frames,
+            num_videos=1,
+            overrides=overrides,
+        )[0]
+        video_items = []
+        for _ in range(num_videos):
+            video_metadata = {
+                "total_num_frames": num_frames,
+                "fps": 2,
+                "duration": num_frames / 2.0,
+                "video_backend": "opencv_dynamic",
+                "frames_indices": [i for i in range(num_frames)],
+                "do_sample_frames": False,
+            }
+            video_item = (video.copy(), video_metadata)
+            video_items.append(video_item)
+
+        return video_items
+
     def get_dummy_mm_data(
         self,
         seq_len: int,
@@ -1056,6 +1224,19 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.config = config
         self.model_config = vllm_config.model_config
 
+        # Pre-tokenize special tokens for video processing
+        # to avoid repeated tokenization
+        tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
+        self._img_start_token_ids = encode_tokens(
+            tokenizer, IMG_START, add_special_tokens=False
+        )
+        self._img_end_token_ids = encode_tokens(
+            tokenizer, IMG_END, add_special_tokens=False
+        )
+        self._img_context_token_ids = encode_tokens(
+            tokenizer, IMG_CONTEXT, add_special_tokens=False
+        )
+
     def pixel_shuffle(self, x, scale_factor=0.5):
         n, w, h, c = x.size()
         # N, W, H, C --> N, W, H * scale, C // scale
@@ -1185,12 +1366,15 @@ def _process_video_input(
         rows = int(image_rows * downsample_ratio // patch_size)
         cols = int(image_cols * downsample_ratio // patch_size)
         video_pruning_rate = self.video_pruning_rate
-
+        video_num_frames = video_input["num_patches"].tolist()
+        video_frames_indices = video_input["frames_indices"].split(video_num_frames)
         # Calculate video feature dimensions (number of frames and
         # their feature size (AKA tokens per frame))
         # TODO: Maybe this can be optimized to avoid the loop?
         for i, single_video_embeddings in enumerate(video_embeddings):
-            num_frames = video_input["num_patches"][i].item()
+            num_frames = video_num_frames[i]
+            frames_indices = video_frames_indices[i].tolist()
+            frame_duration_ms = video_input["frame_duration_ms"][i].item()
             assert single_video_embeddings.shape[0] % num_frames == 0
 
             if video_pruning_rate is not None and video_pruning_rate > 0.0:
@@ -1219,6 +1403,8 @@ def _process_video_input(
                 self._create_final_video_embeddings(
                     single_video_embeddings,
                     num_tokens_per_frame,
+                    frames_indices,
+                    frame_duration_ms,
                 ),
             )
 
@@ -1228,6 +1414,8 @@ def _create_final_video_embeddings(
         self,
         video_embeddings: torch.Tensor,
         num_tokens_per_frame: list[int],
+        frames_indices: list[int],
+        frame_duration_ms: int,
     ) -> torch.Tensor:
         """Create final embeddings that combine video embeddings with
         text embeddings of indicator tokens.
@@ -1241,23 +1429,28 @@ def _create_final_video_embeddings(
         input_embeds for the LLM.
         """
         device = video_embeddings.device
-
-        # Generate video replacement text and convert to token IDs
-        video_repl_text = NanoNemotronVLProcessor.get_video_repl(
-            num_tokens_per_frame,
-            IMG_CONTEXT,
-        ).full
-
         tokenizer = cached_tokenizer_from_config(self.model_config)
-        repl_token_ids = torch.tensor(
-            _seq2tokens(tokenizer, video_repl_text), device=device
-        )
 
-        # Get embedding token IDs for image context
-        embed_token_ids = torch.tensor(
-            encode_tokens(tokenizer, IMG_CONTEXT), device=device
+        # Generate video replacement token IDs using get_video_repl
+        # This tokenizes each frame separator independently, then uses pre-tokenized
+        # special tokens to ensure consistent tokenization regardless of
+        # num_tokens_per_frame values.
+        video_repl = NanoNemotronVLProcessor.get_video_repl(
+            tokens_per_frame=num_tokens_per_frame,
+            frames_indices=frames_indices,
+            frame_duration_ms=frame_duration_ms,
+            tokenizer=tokenizer,
+            img_start_token_ids=self._img_start_token_ids,
+            img_end_token_ids=self._img_end_token_ids,
+            img_context_token_ids=self._img_context_token_ids,
         )
 
+        # video_repl.full is a list of token IDs
+        repl_token_ids = torch.tensor(video_repl.full, device=device)
+
+        # Get embedding token IDs for image context (use pre-tokenized version)
+        embed_token_ids = torch.tensor(self._img_context_token_ids, device=device)
+
         # Create mask for video embedding positions
         is_video_embed = torch.isin(repl_token_ids, embed_token_ids)
 
@@ -1278,6 +1471,8 @@ def _parse_and_validate_video_input(
         pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None)
         video_num_patches = kwargs.pop("video_num_patches", None)
         video_embeds = kwargs.pop("video_embeds", None)
+        frames_indices = kwargs.pop("frames_indices", None)
+        frame_duration_ms = kwargs.pop("frame_duration_ms", None)
 
         if pixel_values_flat_video is None and video_embeds is None:
             return None
@@ -1307,13 +1502,23 @@ def _parse_and_validate_video_input(
 
             pixel_values_flat_video = flatten_bn(pixel_values_flat_video, concat=True)
             video_num_patches = flatten_bn(video_num_patches, concat=True)
+
+            if torch.is_tensor(frames_indices):
+                frames_indices = frames_indices.flatten()
+            else:
+                frames_indices = torch.cat([f.flatten() for f in frames_indices], dim=0)
+
+            frame_duration_ms = frame_duration_ms.flatten()
             expected_h = expected_w = self.config.force_image_size
-            resolve_bindings = {"h": expected_h, "w": expected_w}
+            num_frames = video_num_patches[0].item()
+            resolve_bindings = {"h": expected_h, "w": expected_w, "f": num_frames}
 
             return NanoNemotronVLVideoPixelInputs(
                 type="pixel_values_videos",
                 pixel_values_flat=pixel_values_flat_video,
                 num_patches=video_num_patches,
+                frames_indices=frames_indices,
+                frame_duration_ms=frame_duration_ms,
                 resolve_bindings=resolve_bindings,
             )
 
diff --git a/vllm/model_executor/models/radio.py b/vllm/model_executor/models/radio.py
index 2313b98348b7..03d56eead6e0 100644
--- a/vllm/model_executor/models/radio.py
+++ b/vllm/model_executor/models/radio.py
@@ -43,32 +43,6 @@ def parse(x):
 to_ntuple = _ntuple
 
 
-class InputConditioner(nn.Module):
-    def __init__(
-        self,
-        input_scale: float,
-        norm_mean: norm_t,
-        norm_std: norm_t,
-        dtype: torch.dtype = None,
-    ):
-        super().__init__()
-
-        self.dtype = dtype
-
-        self.register_buffer("norm_mean", _to_tensor(norm_mean) / input_scale)
-        self.register_buffer("norm_std", _to_tensor(norm_std) / input_scale)
-
-    def forward(self, x: torch.Tensor):
-        y = (x - self.norm_mean) / self.norm_std
-        if self.dtype is not None:
-            y = y.to(self.dtype)
-        return y
-
-
-def _to_tensor(v: norm_t):
-    return torch.as_tensor(v, dtype=torch.float32).view(-1, 1, 1)
-
-
 class ClsToken(nn.Module):
     def __init__(
         self,
@@ -507,11 +481,6 @@ def __init__(
         super().__init__()
 
         self.config = config
-        self.input_conditioner = InputConditioner(
-            input_scale=1.0,
-            norm_mean=config.norm_mean,
-            norm_std=config.norm_std,
-        )
         self.model = RadioInternVisionModel(
             config=config,
             quant_config=quant_config,
@@ -525,8 +494,7 @@ def forward(
         pixel_values: Optional[torch.Tensor] = None,
         pixel_embeds: Optional[torch.Tensor] = None,
     ) -> torch.FloatTensor:
-        x = self.input_conditioner(pixel_values)
-        y = self.model(x)
+        y = self.model(pixel_values)
         return self._extract_final(y)
 
     def load_weights(self, weights) -> set[str]:
@@ -548,6 +516,10 @@ def load_weights(self, weights) -> set[str]:
             # Skip buffers not used in vLLM
             if sub in {"summary_idxs"}:
                 continue
+            if sub.startswith("input_conditioner."):
+                # we normalize in the input processor,
+                # based on norm and std values from the config
+                continue
 
             vllm_key = None
             if sub.startswith("model.patch_generator."):
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 2a64f6865f12..a501753d2ebd 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -432,7 +432,6 @@ def _merge_multimodal_embeddings(
     try:
         # For debugging
         # inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype)
-
         # NOTE: This can avoid D2H sync (#22105), but fails to
         # raise an error if is_multimodal.sum() < len(mm_embeds_flat)
         inputs_embeds.masked_scatter_(
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 400d6a6be9be..7e5ba917be71 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -6,7 +6,7 @@
 from functools import partial
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Union
+from typing import Any
 
 import numpy as np
 import numpy.typing as npt
@@ -175,6 +175,16 @@ def load_bytes(
         max_duration: int = 300,
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
+        """
+        Args:
+            num_frames (int): Maximum number of frames to load.
+            The total number of sampled frames will never be larger
+            than this value. Set it to -1 to remove the upper limit.
+
+            fps (int): Desired video sampling rate. The real sampling
+            rate may be lower if we encounter a long video and the
+            num_frames upper limit is set to a positive value.
+        """
         import cv2
 
         backend = cls().get_cv2_video_api()
@@ -183,36 +193,42 @@ def load_bytes(
             raise ValueError("Could not open video stream")
 
         total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        original_fps = cap.get(cv2.CAP_PROP_FPS)
-        duration = total_frames_num / original_fps if original_fps > 0 else 0
+        if total_frames_num == 0:
+            raise ValueError("CAP_PROP_FRAME_COUNT returned 0")
 
-        # resample video to target num_frames
-        max_frame_idx = total_frames_num - 1
-        duration = duration or round(max_frame_idx / original_fps) + 1
-
-        # Refer to:
-        # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
-        frame_indices: Union[range, list[int]]
-        if duration <= max_duration:
-            n = int(math.floor(duration * fps))
-            frame_indices = sorted(
-                {
-                    min(max_frame_idx, int(math.ceil(i * original_fps / fps)))
-                    for i in range(n)
-                }
+        original_fps = cap.get(cv2.CAP_PROP_FPS)
+        if not (original_fps > 0):
+            print(
+                f"WARNING: CAP_PROP_FPS returned {original_fps}. "
+                f"We will use 30 FPS as default fallback."
             )
+            original_fps = 30
+
+        duration = total_frames_num / original_fps
+
+        # Determine target number of samples
+        if num_frames > 0:
+            # Hard upper bound
+            max_samples = int(num_frames)
         else:
-            num_samples = int(max_duration * fps)
-            if num_samples >= total_frames_num:
-                frame_indices = range(total_frames_num)
-            else:
-                target_seconds = np.linspace(0, duration, num_samples, endpoint=True)
-                frame_indices = sorted(
-                    {
-                        min(max_frame_idx, int(math.ceil(t * original_fps)))
-                        for t in target_seconds
-                    }
-                )
+            # No cap -> sample at desired fps
+            max_samples = int(max(1, math.floor(duration * fps)))
+
+        # Clamp to available frames if count is known
+        max_samples = max(1, min(max_samples, total_frames_num))
+
+        # Uniform coverage of the entire timeline within the cap
+        # Use linspace over [0, total_frames-1]
+        raw = np.linspace(0, total_frames_num - 1, max_samples, endpoint=True)
+        frame_indices = np.unique(raw.round().astype(int)).tolist()
+
+        effective_fps = len(frame_indices) / duration
+        print(
+            f"Video [{total_frames_num} fames]({duration:.2f}sec "
+            f"at {original_fps:.2f}fps) sampled "
+            f"into frame [{len(frame_indices)}] indexes {frame_indices} "
+            f"at {effective_fps:.2f}fps."
+        )
 
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

From 73ac36b1e23e608e506b051cec8b47ec3e5a1cd3 Mon Sep 17 00:00:00 2001
From: Natan Bagrov <nbagrov@nvidia.com>
Date: Thu, 16 Oct 2025 00:47:52 -0700
Subject: [PATCH 06/14] bugfix: respect fps in case it implies fewer frames

---
 vllm/multimodal/video.py | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 7e5ba917be71..3be0607b4772 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -171,7 +171,7 @@ def load_bytes(
         cls,
         data: bytes,
         num_frames: int = -1,
-        fps: int = 2,
+        fps: int = -1,
         max_duration: int = 300,
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
@@ -205,17 +205,15 @@ def load_bytes(
             original_fps = 30
 
         duration = total_frames_num / original_fps
+        
+        # Determine target number of samples:
+        max_samples = total_frames_num
+        if num_frames > 0:  # Hard upper bound
+            max_samples = min(num_frames, max_samples)
+        if fps > 0:  # If fps is provided, use it to limit the number of samples
+            max_samples = min(max_samples, math.floor(duration * fps))
+        max_samples = max(1, max_samples)  # to make sure we have at least one sample
 
-        # Determine target number of samples
-        if num_frames > 0:
-            # Hard upper bound
-            max_samples = int(num_frames)
-        else:
-            # No cap -> sample at desired fps
-            max_samples = int(max(1, math.floor(duration * fps)))
-
-        # Clamp to available frames if count is known
-        max_samples = max(1, min(max_samples, total_frames_num))
 
         # Uniform coverage of the entire timeline within the cap
         # Use linspace over [0, total_frames-1]

From 4646a17237fa4ad0ba30d6794489dbf78092730b Mon Sep 17 00:00:00 2001
From: Natan Bagrov <nbagrov@nvidia.com>
Date: Thu, 16 Oct 2025 01:36:31 -0700
Subject: [PATCH 07/14] ruff

---
 vllm/multimodal/video.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 3be0607b4772..26a8ae399dc5 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -205,7 +205,7 @@ def load_bytes(
             original_fps = 30
 
         duration = total_frames_num / original_fps
-        
+
         # Determine target number of samples:
         max_samples = total_frames_num
         if num_frames > 0:  # Hard upper bound
@@ -214,7 +214,6 @@ def load_bytes(
             max_samples = min(max_samples, math.floor(duration * fps))
         max_samples = max(1, max_samples)  # to make sure we have at least one sample
 
-
         # Uniform coverage of the entire timeline within the cap
         # Use linspace over [0, total_frames-1]
         raw = np.linspace(0, total_frames_num - 1, max_samples, endpoint=True)

From 8da7256dd4e3a9c8652ac67eca3b1e43af2b3052 Mon Sep 17 00:00:00 2001
From: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
Date: Thu, 16 Oct 2025 23:30:25 +0300
Subject: [PATCH 08/14] fix: a more robust way to ensure only a single video
 item is processed at a time

Instead of relying on input shapes and on the assumption that the first
dim is the batch dim, just process a single item at a time. Changes in
group_mm_kwargs_by_modality(), which now has merge_by_field_config=True
for almost all relevant models, mean that the batch dimension doesn't
exist anymore and frames from different videos are gathered in the same
dim. That's why we need a more robust way to deal with processing just a
single video.

Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
---
 vllm/v1/worker/gpu_model_runner.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 7e72ce937be4..01aff5d1890e 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1740,14 +1740,22 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
             # output vision tokens count should be considered)
             curr_group_outputs = []
 
-            if self.is_multimodal_pruning_enabled and modality == "video":
-                micro_batch_size = 1
-                for i in range(0, num_items, micro_batch_size):
-                    micro_batch_mm_inputs = dict(
-                        (k, v[i : i + micro_batch_size])
-                        for k, v in mm_kwargs_group.items()
+            if (
+                self.is_multimodal_pruning_enabled
+                and modality == "video"
+                and num_items > 1
+            ):
+                for video_mm_kwargs_item in filter(
+                    lambda item: item.modality == "video", mm_kwargs
+                ):
+                    _, _, micro_batch_mm_inputs = next(
+                        group_mm_kwargs_by_modality(
+                            [video_mm_kwargs_item],
+                            device=self.device,
+                            pin_memory=self.pin_memory,
+                            merge_by_field_config=model.merge_by_field_config,
+                        )
                     )
-
                     micro_batch_outputs = model.get_multimodal_embeddings(
                         **micro_batch_mm_inputs
                     )

From 9e45ae41fe8aa4ffd961cab36e35efeded719ab9 Mon Sep 17 00:00:00 2001
From: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Date: Fri, 17 Oct 2025 09:00:45 +0300
Subject: [PATCH 09/14] Fix wrong dtype (int32) of dummy video input to correct
 one (np.uint8)

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
---
 vllm/multimodal/profiling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 90b19961c6eb..d520be61ddd4 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -223,7 +223,7 @@ def _get_dummy_videos(
                         height,
                     )
                 height = min(height, overrides.height)
-        video = np.full((num_frames, width, height, 3), 255)
+        video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8)
         return [video] * num_videos
 
 

From ae377da1357feacbad250cd26a5d012debeca611 Mon Sep 17 00:00:00 2001
From: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Date: Fri, 17 Oct 2025 11:58:25 +0300
Subject: [PATCH 10/14] Remove hard-coded 16 frames used for memory profiling

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
---
 vllm/model_executor/models/nano_nemotron_vl.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 0f5f892fca56..ef08b8992ab3 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -94,7 +94,7 @@
 IMG_CONTEXT = "<image>"
 
 # Profiling
-MAX_FRAMES = 16
+# MAX_FRAMES = 16
 DEFAULT_NUM_TILES = 12
 
 
@@ -820,8 +820,6 @@ def get_num_frames_with_most_features(
         max_image_tokens = self.get_max_image_tokens() * max_images
         max_total_frames = (seq_len - max_image_tokens) // processor.num_image_token
         max_frames_per_video = max_total_frames // max(max_videos, 1)
-
-        max_frames_per_video = min(max_frames_per_video, MAX_FRAMES)
         return max(max_frames_per_video, 1)
 
     def get_hf_processor(self, **kwargs: object) -> NanoNemotronVLProcessor:

From f6ffcb47dae6c556658bb616bfee1613a4afe818 Mon Sep 17 00:00:00 2001
From: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Date: Fri, 17 Oct 2025 15:14:49 +0300
Subject: [PATCH 11/14] Revert file

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
---
 vllm/model_executor/models/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 8563b21b2898..71abfe98813d 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -440,6 +440,7 @@ def _merge_multimodal_embeddings(
     try:
         # For debugging
         # inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype)
+
         # NOTE: This can avoid D2H sync (#22105), but fails to
         # raise an error if is_multimodal.sum() < len(mm_embeds_flat)
         inputs_embeds.masked_scatter_(

From 31dc3e0cb1f8149503a02114eeaf3eca44817203 Mon Sep 17 00:00:00 2001
From: nvnbagrov <nbagrov@nvidia.com>
Date: Fri, 17 Oct 2025 17:12:55 +0300
Subject: [PATCH 12/14] created a new class for video sampling, revert original
 glm behavior (#4)

* created a new class for video sampling, revert original glm behavior

* ruff

* ruff
---
 vllm/multimodal/video.py | 217 +++++++++++++++++++++++++++++----------
 1 file changed, 161 insertions(+), 56 deletions(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 26a8ae399dc5..9b620ae0a0b2 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -98,6 +98,58 @@ def get_cv2_video_api(self):
             break
         return api_pref
 
+    @classmethod
+    def _get_frame_indices_to_sample(
+        cls,
+        total_frames_num: int,
+        max_num_frames_to_sample: int,
+        **kwargs,
+    ) -> list[int]:
+        full_read = (
+            max_num_frames_to_sample == -1
+            or total_frames_num < max_num_frames_to_sample
+        )
+        if full_read:
+            frame_idx = list(range(0, total_frames_num))
+        else:
+            uniform_sampled_frames = np.linspace(
+                0, total_frames_num - 1, max_num_frames_to_sample, dtype=int
+            )
+            frame_idx = uniform_sampled_frames.tolist()
+        return frame_idx
+
+    @classmethod
+    def _sample_frames_from_video(
+        cls,
+        cap,
+        frame_indices: list[int],
+        allow_missing_frames: bool = False,
+    ) -> npt.NDArray:
+        import cv2
+
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frames = np.empty((len(frame_indices), height, width, 3), dtype=np.uint8)
+
+        i = 0
+        for idx in range(max(frame_indices) + 1):
+            ok = cap.grab()
+            if not ok:
+                break
+            if idx in frame_indices:
+                ret, frame = cap.retrieve()
+                if ret:
+                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    i += 1
+
+        if not allow_missing_frames and i != len(frame_indices):
+            raise ValueError(
+                f"Expected reading {len(frame_indices)} frames, "
+                f"but only loaded {i} frames from video."
+            )
+
+        return frames
+
     @classmethod
     def load_bytes(
         cls,
@@ -117,62 +169,140 @@ def load_bytes(
         duration = total_frames_num / original_fps if original_fps > 0 else 0
 
         # resample video to target num_frames
-        full_read = num_frames == -1 or total_frames_num < num_frames
-        if full_read:
-            num_frames = total_frames_num
-            frame_idx = list(range(0, num_frames))
-        else:
-            uniform_sampled_frames = np.linspace(
-                0, total_frames_num - 1, num_frames, dtype=int
+        frame_indices = cls._get_frame_indices_to_sample(total_frames_num, num_frames)
+        num_frames = len(frame_indices)
+        frames = cls._sample_frames_from_video(cap, frame_indices)
+
+        # Use transformers transformers.video_utils.VideoMetadata format
+        # NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata
+        # can cause incorrect timestamp calculation without num_frames=-1.
+        metadata = {
+            "total_num_frames": num_frames,
+            "fps": num_frames / duration,
+            "duration": duration,
+            "video_backend": "opencv",
+            "frames_indices": frame_indices,
+            # extra field used to control hf processor's video
+            # sampling behavior
+            "do_sample_frames": num_frames == total_frames_num,
+        }
+
+        return frames, metadata
+
+
+@VIDEO_LOADER_REGISTRY.register("opencv_dynamic")
+class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
+    @classmethod
+    def load_bytes(
+        cls,
+        data: bytes,
+        num_frames: int = -1,
+        fps: int = 2,
+        max_duration: int = 300,
+        **kwargs,
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
+        import cv2
+
+        backend = cls().get_cv2_video_api()
+        cap = cv2.VideoCapture(BytesIO(data), backend, [])
+        if not cap.isOpened():
+            raise ValueError("Could not open video stream")
+
+        total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        original_fps = cap.get(cv2.CAP_PROP_FPS)
+        duration = total_frames_num / original_fps if original_fps > 0 else 0
+
+        # resample video to target num_frames
+        max_frame_idx = total_frames_num - 1
+        duration = duration or round(max_frame_idx / original_fps) + 1
+
+        # Refer to:
+        # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
+        frame_indices: range | list[int]
+        if duration <= max_duration:
+            n = int(math.floor(duration * fps))
+            frame_indices = sorted(
+                {
+                    min(max_frame_idx, int(math.ceil(i * original_fps / fps)))
+                    for i in range(n)
+                }
             )
-            frame_idx = uniform_sampled_frames.tolist()
+        else:
+            num_samples = int(max_duration * fps)
+            if num_samples >= total_frames_num:
+                frame_indices = range(total_frames_num)
+            else:
+                target_seconds = np.linspace(0, duration, num_samples, endpoint=True)
+                frame_indices = sorted(
+                    {
+                        min(max_frame_idx, int(math.ceil(t * original_fps)))
+                        for t in target_seconds
+                    }
+                )
 
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8)
+        frames = np.empty((len(frame_indices), height, width, 3), dtype=np.uint8)
 
         i = 0
         for idx in range(total_frames_num):
             ok = cap.grab()
             if not ok:
                 break
-            if idx in frame_idx:
+            if idx in frame_indices:
                 ret, frame = cap.retrieve()
                 if ret:
                     frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                     i += 1
 
-        assert i == num_frames, (
-            f"Expected reading {num_frames} frames, "
+        assert i == len(frame_indices), (
+            f"Expected reading {len(frame_indices)} frames, "
             f"but only loaded {i} frames from video."
         )
 
         # Use transformers transformers.video_utils.VideoMetadata format
-        # NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata
-        # can cause incorrect timestamp calculation without num_frames=-1.
         metadata = {
-            "total_num_frames": num_frames,
-            "fps": num_frames / duration,
+            "total_num_frames": total_frames_num,
+            "fps": original_fps,
             "duration": duration,
-            "video_backend": "opencv",
-            "frames_indices": list(range(num_frames)),
-            # extra field used to control hf processor's video
-            # sampling behavior
-            "do_sample_frames": num_frames == total_frames_num,
+            "video_backend": "opencv_dynamic",
+            "frames_indices": list(frame_indices),
+            "do_sample_frames": False,
         }
 
         return frames, metadata
 
 
-@VIDEO_LOADER_REGISTRY.register("opencv_dynamic")
-class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
+@VIDEO_LOADER_REGISTRY.register("opencv_nemotron_vl_v2")
+class OpenCVNemotronVideoBackend(OpenCVVideoBackend):
+    @classmethod
+    def _get_frame_indices_to_sample(
+        cls,
+        total_frames_num: int,
+        max_num_frames_to_sample: int,
+        fps: int,
+        duration_seconds: float,
+        **kwargs,
+    ) -> list[int]:
+        # Determine target number of samples:
+        max_samples = total_frames_num
+        if max_num_frames_to_sample > 0:  # Hard upper bound
+            max_samples = min(max_num_frames_to_sample, max_samples)
+        if fps > 0:  # If fps is provided, use it to limit the number of samples
+            max_samples = min(max_samples, math.floor(duration_seconds * fps))
+        max_samples = max(1, max_samples)  # to make sure we have at least one sample
+
+        # Uniform coverage of the entire timeline within the cap
+        # Use linspace over [0, total_frames-1]
+        raw = np.linspace(0, total_frames_num - 1, max_samples, endpoint=True)
+        return np.unique(raw.round().astype(int)).tolist()
+
     @classmethod
     def load_bytes(
         cls,
         data: bytes,
         num_frames: int = -1,
         fps: int = -1,
-        max_duration: int = 300,
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         """
@@ -206,18 +336,9 @@ def load_bytes(
 
         duration = total_frames_num / original_fps
 
-        # Determine target number of samples:
-        max_samples = total_frames_num
-        if num_frames > 0:  # Hard upper bound
-            max_samples = min(num_frames, max_samples)
-        if fps > 0:  # If fps is provided, use it to limit the number of samples
-            max_samples = min(max_samples, math.floor(duration * fps))
-        max_samples = max(1, max_samples)  # to make sure we have at least one sample
-
-        # Uniform coverage of the entire timeline within the cap
-        # Use linspace over [0, total_frames-1]
-        raw = np.linspace(0, total_frames_num - 1, max_samples, endpoint=True)
-        frame_indices = np.unique(raw.round().astype(int)).tolist()
+        frame_indices = cls._get_frame_indices_to_sample(
+            total_frames_num, num_frames, fps, duration
+        )
 
         effective_fps = len(frame_indices) / duration
         print(
@@ -227,24 +348,8 @@ def load_bytes(
             f"at {effective_fps:.2f}fps."
         )
 
-        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        frames = np.empty((len(frame_indices), height, width, 3), dtype=np.uint8)
-
-        i = 0
-        for idx in range(total_frames_num):
-            ok = cap.grab()
-            if not ok:
-                break
-            if idx in frame_indices:
-                ret, frame = cap.retrieve()
-                if ret:
-                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    i += 1
-
-        assert i == len(frame_indices), (
-            f"Expected reading {len(frame_indices)} frames, "
-            f"but only loaded {i} frames from video."
+        frames = cls._sample_frames_from_video(
+            cap, frame_indices, allow_missing_frames=True
         )
 
         # Use transformers transformers.video_utils.VideoMetadata format
@@ -252,8 +357,8 @@ def load_bytes(
             "total_num_frames": total_frames_num,
             "fps": original_fps,
             "duration": duration,
-            "video_backend": "opencv_dynamic",
-            "frames_indices": list(frame_indices),
+            "video_backend": "opencv_nemotron_vl_v2",
+            "frames_indices": frame_indices,
             "do_sample_frames": False,
         }
 

From 4e70952edaaebc7dbd1f58d4ed714dce6be49206 Mon Sep 17 00:00:00 2001
From: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Date: Fri, 17 Oct 2025 17:20:57 +0300
Subject: [PATCH 13/14] Rollback OpenCVVideoBackend logic

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
---
 vllm/multimodal/video.py | 116 ++++++++++++++++++++-------------------
 1 file changed, 61 insertions(+), 55 deletions(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 9b620ae0a0b2..2c70002a1af0 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -99,79 +99,53 @@ def get_cv2_video_api(self):
         return api_pref
 
     @classmethod
-    def _get_frame_indices_to_sample(
+    def load_bytes(
         cls,
-        total_frames_num: int,
-        max_num_frames_to_sample: int,
+        data: bytes,
+        num_frames: int = -1,
         **kwargs,
-    ) -> list[int]:
-        full_read = (
-            max_num_frames_to_sample == -1
-            or total_frames_num < max_num_frames_to_sample
-        )
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
+        import cv2
+
+        backend = cls().get_cv2_video_api()
+        cap = cv2.VideoCapture(BytesIO(data), backend, [])
+        if not cap.isOpened():
+            raise ValueError("Could not open video stream")
+
+        total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        original_fps = cap.get(cv2.CAP_PROP_FPS)
+        duration = total_frames_num / original_fps if original_fps > 0 else 0
+
+        # resample video to target num_frames
+        full_read = num_frames == -1 or total_frames_num < num_frames
         if full_read:
-            frame_idx = list(range(0, total_frames_num))
+            num_frames = total_frames_num
+            frame_idx = list(range(0, num_frames))
         else:
             uniform_sampled_frames = np.linspace(
-                0, total_frames_num - 1, max_num_frames_to_sample, dtype=int
+                0, total_frames_num - 1, num_frames, dtype=int
             )
             frame_idx = uniform_sampled_frames.tolist()
-        return frame_idx
-
-    @classmethod
-    def _sample_frames_from_video(
-        cls,
-        cap,
-        frame_indices: list[int],
-        allow_missing_frames: bool = False,
-    ) -> npt.NDArray:
-        import cv2
 
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        frames = np.empty((len(frame_indices), height, width, 3), dtype=np.uint8)
+        frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8)
 
         i = 0
-        for idx in range(max(frame_indices) + 1):
+        for idx in range(total_frames_num):
             ok = cap.grab()
             if not ok:
                 break
-            if idx in frame_indices:
+            if idx in frame_idx:
                 ret, frame = cap.retrieve()
                 if ret:
                     frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                     i += 1
 
-        if not allow_missing_frames and i != len(frame_indices):
-            raise ValueError(
-                f"Expected reading {len(frame_indices)} frames, "
-                f"but only loaded {i} frames from video."
-            )
-
-        return frames
-
-    @classmethod
-    def load_bytes(
-        cls,
-        data: bytes,
-        num_frames: int = -1,
-        **kwargs,
-    ) -> tuple[npt.NDArray, dict[str, Any]]:
-        import cv2
-
-        backend = cls().get_cv2_video_api()
-        cap = cv2.VideoCapture(BytesIO(data), backend, [])
-        if not cap.isOpened():
-            raise ValueError("Could not open video stream")
-
-        total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        original_fps = cap.get(cv2.CAP_PROP_FPS)
-        duration = total_frames_num / original_fps if original_fps > 0 else 0
-
-        # resample video to target num_frames
-        frame_indices = cls._get_frame_indices_to_sample(total_frames_num, num_frames)
-        num_frames = len(frame_indices)
-        frames = cls._sample_frames_from_video(cap, frame_indices)
+        assert i == num_frames, (
+            f"Expected reading {num_frames} frames, "
+            f"but only loaded {i} frames from video."
+        )
 
         # Use transformers transformers.video_utils.VideoMetadata format
         # NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata
@@ -181,7 +155,7 @@ def load_bytes(
             "fps": num_frames / duration,
             "duration": duration,
             "video_backend": "opencv",
-            "frames_indices": frame_indices,
+            "frames_indices": list(range(num_frames)),
             # extra field used to control hf processor's video
             # sampling behavior
             "do_sample_frames": num_frames == total_frames_num,
@@ -297,6 +271,38 @@ def _get_frame_indices_to_sample(
         raw = np.linspace(0, total_frames_num - 1, max_samples, endpoint=True)
         return np.unique(raw.round().astype(int)).tolist()
 
+    @classmethod
+    def _sample_frames_from_video(
+        cls,
+        cap,
+        frame_indices: list[int],
+        allow_missing_frames: bool = False,
+    ) -> tuple[npt.NDArray, list[int]]:
+        """Decode the requested frames as RGB; return ``(frames, loaded_indices)``.
+
+        Raises ValueError on missing frames unless ``allow_missing_frames`` is set.
+        """
+        import cv2
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frames = np.empty((len(frame_indices), height, width, 3), dtype=np.uint8)
+        wanted = set(frame_indices)
+        loaded: list[int] = []  # indices actually decoded, aligned with frames
+        for idx in range(max(frame_indices, default=-1) + 1):
+            if not cap.grab():
+                break
+            if idx in wanted:
+                ret, frame = cap.retrieve()
+                if ret:
+                    frames[len(loaded)] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    loaded.append(idx)
+        if not allow_missing_frames and len(loaded) != len(frame_indices):
+            raise ValueError(
+                f"Expected reading {len(frame_indices)} frames, "
+                f"but only loaded {len(loaded)} frames from video."
+            )
+        return frames[: len(loaded)], loaded
+
     @classmethod
     def load_bytes(
         cls,
@@ -348,7 +354,7 @@ def load_bytes(
             f"at {effective_fps:.2f}fps."
         )
 
-        frames = cls._sample_frames_from_video(
+        frames, frame_indices = cls._sample_frames_from_video(
             cap, frame_indices, allow_missing_frames=True
         )
 

From e66ba3b38f83b1da3bdf67df2ec3c7c74262064b Mon Sep 17 00:00:00 2001
From: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Date: Fri, 17 Oct 2025 17:22:11 +0300
Subject: [PATCH 14/14] Cosmetics

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
---
 vllm/v1/worker/gpu_model_runner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 01aff5d1890e..a926c080bb60 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1756,6 +1756,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
                             merge_by_field_config=model.merge_by_field_config,
                         )
                     )
+
                     micro_batch_outputs = model.get_multimodal_embeddings(
                         **micro_batch_mm_inputs
                     )