diff --git a/benchmark/mmmu/bench_hf.py b/benchmark/mmmu/bench_hf.py
index 949b63b802a7..c841f44466d7 100644
--- a/benchmark/mmmu/bench_hf.py
+++ b/benchmark/mmmu/bench_hf.py
@@ -36,9 +36,10 @@ def eval_mmmu(args):
         try:
             # check if the model is belongs to internvl
             if "InternVL" in args.model_path:
-                from internvl_utils import load_image
                 from transformers import AutoTokenizer
 
+                from sglang.srt.multimodal.internvl_utils import image_to_pixel_values
+
                 tokenizer = AutoTokenizer.from_pretrained(args.model_path)
                 model = AutoModel.from_pretrained(
                     args.model_path,
@@ -80,7 +81,11 @@ def eval_mmmu(args):
         assert image is not None
 
         if "InternVL" in args.model_path:
-            pixel_values = load_image(sample["image_path"]).to(torch.bfloat16).cuda()
+            image = PIL.Image.open(sample["image_path"]).convert("RGB")
+            pixel_values = image_to_pixel_values(
+                image, input_size=448, max_num=12, use_thumbnail=True
+            )
+            pixel_values = pixel_values.to(device="cuda", dtype=torch.bfloat16)
             contents = ""
             if prefix:
                 contents += prefix
diff --git a/docs/supported_models/multimodal_language_models.md b/docs/supported_models/multimodal_language_models.md
index 3414d6c48d3a..90c877518de0 100644
--- a/docs/supported_models/multimodal_language_models.md
+++ b/docs/supported_models/multimodal_language_models.md
@@ -45,6 +45,7 @@ in the GitHub search bar.
 | **DotsVLM** (General/OCR)  | `rednote-hilab/dots.vlm1.inst`             | RedNote's vision-language model built on a 1.2B vision encoder and DeepSeek V3 LLM, featuring NaViT vision encoder trained from scratch with dynamic resolution support and enhanced OCR capabilities through structured image data training. |  |
 | **DotsVLM-OCR**            | `rednote-hilab/dots.ocr`                   | Specialized OCR variant of DotsVLM optimized for optical character recognition tasks with enhanced text extraction and document understanding capabilities. | Don't use `--trust-remote-code` |
 | **NVILA** (8B, 15B, Lite-2B, Lite-8B, Lite-15B) | `Efficient-Large-Model/NVILA-8B` | `chatml` | NVILA explores the full stack efficiency of multi-modal design, achieving cheaper training, faster deployment and better performance. |
+| **NVIDIA Nemotron Nano 2.0 VL** | `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16` | NVIDIA Nemotron Nano v2 VL enables multi-image reasoning and video understanding, along with strong document intelligence, visual Q&A and summarization capabilities. It builds on Nemotron Nano V2, a hybrid Mamba-Transformer LLM, in order to achieve higher inference throughput in long document and video scenarios. | Use `--trust-remote-code`. You may need to adjust `--max-mamba-cache-size` [default is 512] to fit memory constraints. |
 | **JetVLM** |  | JetVLM is an vision-language model designed for high-performance multimodal understanding and generation tasks built upon Jet-Nemotron. | Coming soon |
 
 ## Video Input Support
@@ -57,6 +58,7 @@ SGLang supports video input for Vision-Language Models (VLMs), enabling temporal
 | **GLM-4v** (4.5V, 4.1V, MOE) | `zai-org/GLM-4.5V` | Video clips are read with Decord, converted to tensors, and passed to the model alongside metadata for rotary-position handling. |
 | **NVILA** (Full & Lite) | `Efficient-Large-Model/NVILA-8B` | The runtime samples eight frames per clip and attaches them to the multimodal request when `video_data` is present. |
 | **LLaVA video variants** (LLaVA-NeXT-Video, LLaVA-OneVision) | `lmms-lab/LLaVA-NeXT-Video-7B` | The processor routes video prompts to the LlavaVid video-enabled architecture, and the provided example shows how to query it with `sgl.video(...)` clips. |
+| **NVIDIA Nemotron Nano 2.0 VL** | `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16` | For video, the processor is configured to sample at 2 FPS, at a max of 128 frames, as per model training. |
 | **JetVLM** |  | The runtime samples eight frames per clip and attaches them to the multimodal request when `video_data` is present. |
 
 Use `sgl.video(path, num_frames)` when building prompts to attach clips from your SGLang programs.
diff --git a/python/sglang/srt/configs/__init__.py b/python/sglang/srt/configs/__init__.py
index 623d91d7352f..b35cc1dc5f23 100644
--- a/python/sglang/srt/configs/__init__.py
+++ b/python/sglang/srt/configs/__init__.py
@@ -12,6 +12,7 @@
 from sglang.srt.configs.kimi_vl import KimiVLConfig
 from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
 from sglang.srt.configs.longcat_flash import LongcatFlashConfig
+from sglang.srt.configs.nano_nemotron_vl import NemotronH_Nano_VL_V2_Config
 from sglang.srt.configs.nemotron_h import NemotronHConfig
 from sglang.srt.configs.olmo3 import Olmo3Config
 from sglang.srt.configs.qwen3_next import Qwen3NextConfig
@@ -40,6 +41,7 @@
     "DotsOCRConfig",
     "FalconH1Config",
     "NemotronHConfig",
+    "NemotronH_Nano_VL_V2_Config",
     "JetNemotronConfig",
     "JetVLMConfig",
 ]
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 446533f53fb3..7f1f3e472c6f 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -938,6 +938,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
     "Mistral3ForConditionalGeneration",
     "MultiModalityCausalLM",
     "MllamaForConditionalGeneration",
+    "NemotronH_Nano_VL_V2",
     "Qwen2AudioForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
diff --git a/python/sglang/srt/configs/nano_nemotron_vl.py b/python/sglang/srt/configs/nano_nemotron_vl.py
new file mode 100644
index 000000000000..09ab29abf465
--- /dev/null
+++ b/python/sglang/srt/configs/nano_nemotron_vl.py
@@ -0,0 +1,114 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/blob/cb5a65ff10232128389d882d805fa609427544f1/configuration.py
+
+from typing import Any
+
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.configs.nemotron_h import NemotronHConfig
+from sglang.srt.configs.radio import RadioConfig
+from sglang.srt.multimodal.internvl_utils import IMAGENET_MEAN, IMAGENET_STD
+
+
+def float_triplet(seq: Any):
+    """Return ``seq`` as a 3-tuple of floats; ``int`` items are coerced to float."""
+    a, b, c = tuple(seq)  # ValueError unless ``seq`` has exactly three items
+    if not all(isinstance(v, (int, float)) for v in (a, b, c)):
+        raise TypeError("expected three floats")
+    return float(a), float(b), float(c)
+
+
+class NemotronH_Nano_VL_V2_Config(PretrainedConfig):
+    model_type = "NemotronH_Nano_VL_V2"
+    is_composition = True
+
+    def __init__(
+        self,
+        vision_config: dict | None = None,
+        llm_config: dict | None = None,
+        force_image_size: int = 512,
+        patch_size: int = 16,
+        downsample_ratio: float = 0.5,
+        template=None,
+        ps_version: str = "v2",
+        image_tag_type: str = "internvl",
+        projector_hidden_size: int = 4096,
+        vit_hidden_size: int = 1280,
+        video_pruning_rate: float = 0.0,
+        video_context_token: str = "<video>",
+        img_context_token: str = "<image>",
+        img_start_token: str = "<img>",
+        img_end_token: str = "</img>",
+        norm_mean: tuple[float, float, float] | list[float] = IMAGENET_MEAN,
+        norm_std: tuple[float, float, float] | list[float] = IMAGENET_STD,
+        use_thumbnail: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # Two construction paths: loading from JSON (llm_config and vision_config are
+        # dicts) or a no-argument call made internally by transformers (both None).
+        if llm_config is not None:
+            self.llm_config = NemotronHConfig(**llm_config)
+            assert isinstance(vision_config, dict), "vision_config must be a dictionary"
+            self.raw_vision_config = vision_config
+        else:
+            assert vision_config is None
+            self.llm_config = NemotronHConfig()
+            self.raw_vision_config = {}
+        # Image and patch size: the vision dict overrides the keyword defaults, and
+        # when the value is a list only its first entry is used.
+        vision_image_size = self.raw_vision_config.get("image_size", force_image_size)
+        vision_patch_size = self.raw_vision_config.get("patch_size", patch_size)
+        self.image_size = int(
+            vision_image_size[0]
+            if isinstance(vision_image_size, list)
+            else vision_image_size
+        )
+        self.patch_size = int(
+            vision_patch_size[0]
+            if isinstance(vision_patch_size, list)
+            else vision_patch_size
+        )
+
+        self.downsample_ratio = downsample_ratio
+        self.video_context_token = video_context_token
+        self.img_context_token = img_context_token
+        self.template = template  # TODO move out of here and into the tokenizer
+        self.ps_version = ps_version  # Pixel shuffle version
+        self.image_tag_type = image_tag_type  # TODO: into the tokenizer too?
+        self.projector_hidden_size = projector_hidden_size
+        self.vit_hidden_size = vit_hidden_size
+        self.video_pruning_rate = video_pruning_rate
+        # Normalization statistics are stored as 3-tuples (see float_triplet).
+        self.norm_mean = float_triplet(norm_mean)
+        self.norm_std = float_triplet(norm_std)
+        self.use_thumbnail = use_thumbnail
+        self.img_start_token = img_start_token
+        self.img_end_token = img_end_token
+
+    def create_radio_config(self):
+        """Build the ``RadioConfig`` for the vision model from the raw vision dict."""
+        vision_cfg = self.raw_vision_config
+        vision_args = vision_cfg["args"]  # KeyError if the vision dict lacks "args"
+        # Only the first entry of "preferred_resolution" is used; [224] if absent.
+        preferred = vision_cfg.get("preferred_resolution", [224])
+        return RadioConfig(
+            model_name=vision_args["model"],
+            image_size=preferred[0],
+            patch_size=self.patch_size,
+            norm_mean=self.norm_mean,
+            norm_std=self.norm_std,
+            reg_tokens=vision_args.get("register_multiple"),
+        )
diff --git a/python/sglang/srt/configs/radio.py b/python/sglang/srt/configs/radio.py
new file mode 100644
index 000000000000..cc6df58e0ff2
--- /dev/null
+++ b/python/sglang/srt/configs/radio.py
@@ -0,0 +1,106 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/radio.py
+
+"""Radio vision model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VIT_TIMM_DIM_BY_NAME: dict[str, tuple[int, int, int, int]] = {
+    "vit_small_patch16_224": (384, 12, 6, 1536),
+    "vit_base_patch16_224": (768, 12, 12, 3072),
+    "vit_large_patch16_224": (1024, 24, 16, 4096),
+    "vit_huge_patch16_224": (1280, 32, 16, 5120),
+}
+
+OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
+
+
+class RadioConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a Radio
+    vision model. It is used to instantiate a Radio model according to the
+    specified arguments, defining the model architecture.
+
+    Args:
+        model_name: Name of the vision transformer model, e.g.
+            "vit_base_patch16_224". Must be a key of `VIT_TIMM_DIM_BY_NAME`,
+            which supplies the hidden/layer/head/intermediate sizes.
+        image_size: The size (resolution) of each image.
+        patch_size: The size (resolution) of each patch.
+        qkv_bias: Whether to add a bias to the queries, keys and values.
+        qk_normalization: Whether to apply normalization to queries and keys.
+        norm_type: The normalization type to use.
+        layer_norm_eps: The epsilon used by the layer normalization layers.
+        initializer_factor: A factor for initializing all weight matrices.
+        hidden_act: The non-linear activation function in the encoder.
+        max_img_size: Maximum image size for position embeddings.
+        norm_mean, norm_std: Per-channel (RGB) normalization mean / standard
+            deviation. Default to `OPENAI_CLIP_MEAN` / `OPENAI_CLIP_STD`;
+            tuples and lists are stored as lists.
+        reg_tokens: Number of register tokens to use.
+        drop_path_rate, dropout: Stored on the config unchanged.
+    """
+
+    model_type = "radio"
+
+    def __init__(
+        self,
+        model_name: str,
+        image_size: int = 224,
+        patch_size: int = 16,
+        qkv_bias: bool = True,
+        qk_normalization: bool = False,
+        norm_type: str = "layer_norm",
+        layer_norm_eps: float = 1e-6,
+        initializer_factor: float = 1.0,
+        hidden_act: str = "gelu",
+        max_img_size: int = 2048,
+        norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN,
+        norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD,
+        reg_tokens: int | None = None,
+        drop_path_rate: float = 0.0,
+        dropout: float = 0.0,
+        **kwargs,
+    ):
+        self.model_name = model_name
+        (
+            self.hidden_size,
+            self.num_hidden_layers,
+            self.num_attention_heads,
+            self.intermediate_size,
+        ) = VIT_TIMM_DIM_BY_NAME[model_name]  # KeyError for an unlisted model name
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.qkv_bias = qkv_bias
+        self.qk_normalization = qk_normalization
+        self.norm_type = norm_type
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_factor = initializer_factor
+        self.hidden_act = hidden_act
+        self.max_img_size = max_img_size
+        self.norm_mean = (
+            list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
+        )
+        self.norm_std = (
+            list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
+        )
+        self.reg_tokens = reg_tokens
+        self.drop_path_rate = drop_path_rate
+        self.dropout = dropout
+        super().__init__(**kwargs)  # remaining kwargs go to PretrainedConfig
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index e62f0774673b..f3b93c375ed1 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -34,6 +34,7 @@
     JetNemotronConfig,
     JetVLMConfig,
     KimiLinearConfig,
+    NemotronH_Nano_VL_V2_Config,
     NemotronHConfig,
     Qwen3NextConfig,
 )
@@ -1474,6 +1475,8 @@ def mamba2_config(self):
         config = self.model_config.hf_config
         if isinstance(config, FalconH1Config | NemotronHConfig):
             return config
+        if isinstance(config, NemotronH_Nano_VL_V2_Config):
+            return config.llm_config
         return None
 
     @property
diff --git a/python/sglang/srt/models/nano_nemotron_vl.py b/python/sglang/srt/models/nano_nemotron_vl.py
new file mode 100644
index 000000000000..e337672172b1
--- /dev/null
+++ b/python/sglang/srt/models/nano_nemotron_vl.py
@@ -0,0 +1,219 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/nano_nemotron_vl.py
+
+import logging
+from typing import Iterable
+
+import torch
+import torch.nn as nn
+
+from sglang.srt.configs.nano_nemotron_vl import NemotronH_Nano_VL_V2_Config
+from sglang.srt.layers.activation import ReLU2
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.nemotron_h import NemotronHForCausalLM
+from sglang.srt.models.radio import RadioModel
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class NemotronH_Nano_VL_V2(nn.Module):
+    def __init__(
+        self,
+        config: NemotronH_Nano_VL_V2_Config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        """Build the LLM, the RADIO vision model and the ``mlp1`` projector."""
+        super().__init__()
+        self.downsample_ratio = config.downsample_ratio
+        self.language_model = NemotronHForCausalLM(
+            config=config.llm_config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        # Resolve the LLM dtype once so the vision model and the projector always
+        # agree; newer transformers expose ``dtype``, older ones ``torch_dtype``.
+        lm_config = self.language_model.config
+        dtype = getattr(lm_config, "dtype", None)
+        if dtype is None:
+            dtype = lm_config.torch_dtype
+        self.vision_model = RadioModel(config=config.create_radio_config()).to(dtype)
+
+        # pixel_shuffle multiplies the channel count by (1 / downsample_ratio) ** 2,
+        # which is therefore the input width of the projector.
+        vit_hidden_size = config.vit_hidden_size
+        self.rmsnorm_hidden_size = vit_hidden_size * int(1 / self.downsample_ratio) ** 2
+        vision_projection_hidden_size = config.projector_hidden_size
+        llm_hidden_size = config.llm_config.hidden_size
+        self.mlp1 = nn.Sequential(
+            RMSNorm(hidden_size=self.rmsnorm_hidden_size, eps=1e-5),
+            nn.Linear(
+                self.rmsnorm_hidden_size, vision_projection_hidden_size, bias=False
+            ),
+            ReLU2(),
+            nn.Linear(vision_projection_hidden_size, llm_hidden_size, bias=False),
+        ).to(dtype)
+        self.config = config
+
+    def pad_input_ids(self, input_ids: list[int], mm_inputs: MultimodalInputs):
+        """Pad ``input_ids`` using the image start/end token ids of ``mm_inputs``.
+
+        The work is delegated to ``MultiModalityDataPaddingPatternTokenPairs``.
+        """
+        # A single (start, end) token-id pair is registered with the helper.
+        token_pairs = [(mm_inputs.im_start_id, mm_inputs.im_end_id)]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(token_pairs)
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def pixel_shuffle(self, x: torch.Tensor, scale_factor: float = 0.5) -> torch.Tensor:
+        """Scale both middle axes of ``x`` by ``scale_factor``, widening the last one.
+
+        ``(n, w, h, c)`` becomes ``(n, w * s, h * s, c / s**2)``; with
+        ``ps_version == "v1"`` the two middle axes are left swapped instead.
+        """
+        batch, width, height, channels = x.size()
+        new_h = int(height * scale_factor)
+        new_w = int(width * scale_factor)
+
+        # (n, w, h, c) -> (n, w, h * s, c / s)
+        x = x.view(batch, width, new_h, int(channels / scale_factor))
+        # -> (n, h * s, w, c / s)
+        x = x.permute(0, 2, 1, 3).contiguous()
+        # -> (n, h * s, w * s, c / s**2)
+        x = x.view(batch, new_h, new_w, int(channels / (scale_factor * scale_factor)))
+
+        if self.config.ps_version == "v1":
+            return x
+
+        # Any other version swaps the middle axes back: (n, w * s, h * s, c / s**2)
+        return x.permute(0, 2, 1, 3).contiguous()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()  # delegates to the LLM
+
+    def extract_feature(self, pixel_values):
+        """Encode frames and project them; returns (frames, tokens, llm_width)."""
+        # Frames are processed in micro-batches of at most 128 per call to bound
+        # peak GPU memory for huge batches (really long videos with EVS on),
+        # since chunked prefill is not supported for video media.
+        micro_batch_size = 128
+        vit_embeds_list = []
+        for i in range(0, pixel_values.shape[0], micro_batch_size):
+            vit_embeds = self.vision_model(pixel_values[i : i + micro_batch_size])
+            vit_embeds = vit_embeds.to(dtype=torch.bfloat16)
+            num_frames = vit_embeds.shape[0]  # frames in THIS micro-batch only
+            h = w = int(vit_embeds.shape[1] ** 0.5)
+            vit_embeds = vit_embeds.reshape(num_frames, h, w, -1)
+            vit_embeds = self.pixel_shuffle(
+                vit_embeds, scale_factor=self.downsample_ratio
+            )
+            vit_embeds = vit_embeds.view(-1, self.rmsnorm_hidden_size)
+            vit_embeds = self.mlp1(vit_embeds)
+            # mlp1 changes the last dim, so regroup by its output width per frame.
+            vit_embeds = vit_embeds.view(num_frames, -1, vit_embeds.shape[-1])
+            vit_embeds_list.append(vit_embeds)
+        return torch.cat(vit_embeds_list, dim=0)
+
+    def get_image_feature(self, items: list[MultimodalDataItem]):
+        """
+        Concatenate the ``feature`` tensors of ``items`` and run ``extract_feature``.
+
+        Returns:
+            image_features (`torch.Tensor`): the 3-D tensor that
+                ``extract_feature`` produces for the concatenated pixel values.
+        """
+        batched_pixels = torch.cat(tuple(item.feature for item in items))
+        return self.extract_feature(batched_pixels)
+
+    def get_video_feature(self, items: list[MultimodalDataItem]):
+        """
+        Concatenate the ``feature`` tensors of ``items`` and run ``extract_feature``.
+
+        Returns:
+            video_features (`torch.Tensor`): the 3-D tensor that
+                ``extract_feature`` produces for the concatenated pixel values.
+        """
+        batched_pixels = torch.cat(tuple(item.feature for item in items))
+        return self.extract_feature(batched_pixels)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ):
+        """Run ``general_mm_embed_routine``; ``get_embedding`` is not used here."""
+        return general_mm_embed_routine(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            multimodal_model=self,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+                Modality.VIDEO: self.get_video_feature,
+            },
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        """Dispatch checkpoint tensors to the LLM, ``mlp1`` and the vision model.
+
+        Args:
+            weights: ``(name, tensor)`` pairs. ``language_model*`` entries are
+                handed to the language model, ``mlp1*`` entries are loaded into
+                the adapter here, ``vision_model.radio_model.*`` entries are
+                handed to the vision model; any other name is skipped.
+        """
+        projector_params = dict(self.mlp1.named_parameters())
+        for_llm, for_vision = [], []
+
+        for name, tensor in weights:
+            # Everything after the first dot, i.e. the leading component removed.
+            tail = name.partition(".")[2]
+            if name.startswith("language_model"):
+                for_llm.append((tail, tensor))
+                continue
+
+            if name.startswith("mlp1"):
+                # Adapter tensors go through default_weight_loader right away.
+                with torch.no_grad():
+                    default_weight_loader(projector_params[tail], tensor)
+                continue
+
+            if name.startswith("vision_model.radio_model."):
+                # vision_model.radio_model.* -> radio_model.*
+                for_vision.append((tail, tensor))
+
+        # Each sub-model's own loader consumes its share of the checkpoint.
+        self.language_model.load_weights(for_llm)
+        self.vision_model.load_weights(for_vision)
+
+
+EntryClass = [NemotronH_Nano_VL_V2]
diff --git a/python/sglang/srt/models/nemotron_h.py b/python/sglang/srt/models/nemotron_h.py
index 8a2075ea9a44..85a0c24b37a6 100644
--- a/python/sglang/srt/models/nemotron_h.py
+++ b/python/sglang/srt/models/nemotron_h.py
@@ -542,9 +542,6 @@ def get_layer(idx: int, prefix: str):
         )
         self.norm_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.embed_tokens(input_ids)
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -557,7 +554,7 @@ def forward(
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
-                hidden_states = self.get_input_embeddings(input_ids)
+                hidden_states = self.embed_tokens(input_ids)
             residual = None
         else:
             assert pp_proxy_tensors is not None
@@ -641,8 +638,8 @@ def _init_model(
             config=config, quant_config=quant_config, prefix=add_prefix("model", prefix)
         )
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def get_input_embeddings(self) -> VocabParallelEmbedding:
+        return self.model.embed_tokens
 
     @torch.no_grad()
     def forward(
diff --git a/python/sglang/srt/models/radio.py b/python/sglang/srt/models/radio.py
new file mode 100644
index 000000000000..2cd233141c15
--- /dev/null
+++ b/python/sglang/srt/models/radio.py
@@ -0,0 +1,532 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/radio.py
+
+import math
+from collections.abc import Iterable
+from itertools import repeat
+from typing import TypeAlias
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from transformers import PretrainedConfig
+from transformers.modeling_outputs import BaseModelOutput
+
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    replace_prefix,
+    replace_substrings,
+)
+from sglang.srt.models.internvl import InternVisionEncoder
+
+input_dim_t: TypeAlias = int | tuple[int, int]
+norm_t: TypeAlias = tuple[float, float, float] | torch.Tensor
+
+
+def _ntuple(n):
+    def parse(x):
+        # Strings count as scalars; other iterables pass through unchecked.
+        is_scalar = isinstance(x, str) or not isinstance(x, Iterable)
+        return (x,) * n if is_scalar else tuple(x)
+
+    return parse
+
+
+to_1tuple = _ntuple(1)
+to_2tuple = _ntuple(2)
+to_3tuple = _ntuple(3)
+to_4tuple = _ntuple(4)
+to_ntuple = _ntuple
+
+
+class ClsToken(nn.Module):
+    """Learned class/register tokens that are prepended to a patch sequence.
+
+    Args:
+        ndim: Width of each token.
+        num_tokens: Number of class tokens.
+        enabled: When false no parameter is created and ``forward`` is a no-op.
+        register_multiple: If set (and ``num_registers`` is not), registers are
+            added so that the total token count is a multiple of this value.
+        num_registers: Explicit number of register tokens.
+    """
+
+    def __init__(
+        self,
+        ndim: int,
+        num_tokens: int = 1,
+        enabled: bool = True,
+        register_multiple: int | None = None,
+        num_registers: int | None = None,
+    ):
+        super().__init__()
+        self.ndim = ndim
+        self.enabled = enabled
+        self.num_tokens = num_tokens
+
+        # An explicit ``num_registers`` wins over ``register_multiple``; a
+        # disabled module never has registers.
+        if enabled and num_registers:
+            self.num_registers = num_registers
+        elif enabled and register_multiple:
+            self.num_registers = register_multiple - num_tokens % register_multiple
+        else:
+            self.num_registers = 0
+        self.num_patches = self.num_tokens + self.num_registers
+
+        if enabled:
+            init = torch.randn(self.num_patches, ndim) * ndim**-0.5
+            self.token = nn.Parameter(init)
+        else:
+            self.token = None
+
+    def forward(self, x: torch.Tensor):
+        """Return ``x`` with the tokens prepended on dim 1 (unchanged if disabled)."""
+        if self.token is None:
+            return x
+        batch_tokens = self.token.unsqueeze(0).expand(x.shape[0], -1, -1)
+        return torch.cat([batch_tokens, x], dim=1)
+
+
+class ViTPatchGenerator(nn.Module):
+    def __init__(
+        self,
+        #  config: PretrainedConfig,
+        patch_size: int,
+        embed_dim: int,
+        input_dims: input_dim_t,
+        abs_pos: bool = True,
+        normalize_patches: bool = False,
+        cls_token: bool = False,
+        max_input_dims: input_dim_t | None = None,
+        pos_dropout: float = 0.0,
+        return_pos_enc: bool = False,
+        num_cls_tokens: int = 1,
+        register_multiple: int | None = None,
+        num_registers: int | None = None,
+        patch_bias: bool = False,
+        device=None,
+        dtype=None,
+    ):
+        super().__init__()
+        if isinstance(input_dims, int):
+            input_dims = (input_dims, input_dims)
+
+        if max_input_dims is None:
+            max_input_dims = input_dims
+        if isinstance(max_input_dims, int):
+            max_input_dims = (max_input_dims, max_input_dims)
+
+        max_input_dims = tuple(
+            int(math.ceil(d / patch_size) * patch_size) for d in max_input_dims
+        )
+
+        self.cpe_mode = max_input_dims != input_dims
+        self.pos_dropout = pos_dropout
+        self.return_pos_enc = return_pos_enc
+
+        factory = dict(device=device, dtype=dtype)
+
+        self.patch_size = patch_size
+        self.abs_pos = abs_pos
+        self.embed_dim = embed_dim
+
+        self.num_rows = max_input_dims[0] // patch_size
+        self.num_cols = max_input_dims[1] // patch_size
+        self.input_dims = tuple(d // patch_size for d in input_dims)
+        self.num_patches = self.num_rows * self.num_cols
+        self.max_input_dims = max_input_dims
+
+        self.im_to_patches = Im2Patches(patch_size)
+        self.embedder = ViTPatchLinear(
+            patch_size, embed_dim, bias=patch_bias, **factory
+        )
+
+        if abs_pos:
+            scale = embed_dim**-0.5
+            self.pos_embed = nn.Parameter(
+                torch.randn(1, self.num_patches, embed_dim, **factory) * scale
+            )
+
+        self.cls_token = ClsToken(
+            embed_dim,
+            num_tokens=num_cls_tokens,
+            enabled=cls_token,
+            register_multiple=register_multiple,
+            num_registers=num_registers,
+        )
+
+        self.patch_normalizer = (
+            nn.LayerNorm(embed_dim) if normalize_patches else nn.Identity()
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Patchify and project, then add positional encodings to the patch tokens.
+        embedded, pos_enc = self.apply_pos_enc(
+            self.embed_patches(x), input_size=x.shape[2:]
+        )
+        # cls_token runs before patch_normalizer (nn.Identity unless normalize_patches).
+        tokens = self.patch_normalizer(self.cls_token(embedded))
+        return (tokens, pos_enc) if self.return_pos_enc else tokens
+
+    @property
+    def apply_cls_token(self):
+        return self.cls_token.enabled  # forwards the ClsToken module's `enabled` flag
+
+    @property
+    def num_cls_tokens(self):
+        return self.cls_token.num_tokens  # read back from the ClsToken module built in __init__
+
+    @property
+    def num_cls_patches(self):
+        return self.cls_token.num_patches  # delegates to ClsToken; its definition is outside this chunk
+
+    @property
+    def num_registers(self):
+        return self.cls_token.num_registers  # delegates to ClsToken; its definition is outside this chunk
+
+    @property
+    def num_skip(self):
+        return self.num_cls_tokens + self.num_registers  # total CLS + register tokens (non-patch tokens)
+
+    def _load_embed(self, src_embed: torch.Tensor, targ_embed: nn.Parameter):  # copy src into targ, resizing first if shapes differ
+        if src_embed.shape != targ_embed.shape:
+            src_size = int(math.sqrt(src_embed.shape[1]))  # side length of the source grid, which must be square
+
+            assert (
+                src_size**2 == src_embed.shape[1]
+            ), "Unable to interpolate non-square embedding"
+
+            src_embed = rearrange(  # token sequence -> 2-D grid so it can be resized spatially
+                src_embed, "b (h w) c -> b c h w", h=src_size, w=src_size
+            )
+            src_embed = F.interpolate(
+                src_embed,
+                size=(self.num_rows, self.num_cols),  # resize to this module's own position grid
+                mode="bicubic",
+                align_corners=True,
+                antialias=False,
+            )
+            src_embed = rearrange(src_embed, "b c h w -> b (h w) c")  # back to a token sequence
+        targ_embed.data.copy_(src_embed)  # in-place copy into the existing parameter storage
+
+    def _load_projection(  # copy a patch-projection weight into targ, resizing the patch kernel if shapes differ
+        self, src_proj_weight: torch.Tensor, targ_proj_weight: torch.Tensor
+    ):
+        if src_proj_weight.shape != targ_proj_weight.shape:
+            src_patch_size = int(math.sqrt(src_proj_weight.shape[1] // 3))  # columns are 3 * p * p; recover p
+
+            assert (src_patch_size**2) * 3 == src_proj_weight.shape[
+                1
+            ], "Unable to interpolate non-square patch size"
+
+            src_proj_weight = rearrange(  # flat columns -> (3, p, p) per output row
+                src_proj_weight,
+                "b (c h w) -> b c h w",
+                c=3,
+                h=src_patch_size,
+                w=src_patch_size,
+            )
+            src_proj_weight = F.interpolate(
+                src_proj_weight,
+                size=(self.patch_size, self.patch_size),  # resize the kernel to this module's patch size
+                mode="bicubic",
+                align_corners=True,
+                antialias=False,
+            )
+            src_proj_weight = rearrange(src_proj_weight, "b c h w -> b (c h w)")  # flatten back to columns
+        targ_proj_weight.data.copy_(src_proj_weight)  # in-place copy into the target weight
+
+    def embed_patches(self, x: torch.Tensor) -> torch.Tensor:
+        # Cut the input into flattened patches, then run them through the embedder.
+        flat_patches = self.im_to_patches(x)
+        return self.embedder(flat_patches)
+
+    def apply_pos_enc(
+        self,
+        patches: torch.Tensor,
+        patch_idxs: torch.Tensor | None = None,
+        input_size: tuple[int, int] | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        # Returns (patches with positions added, raw pos_enc). forward() unpacks two
+        # values, so the abs_pos=False path must return a pair as well.
+        if not self.abs_pos:
+            return patches, None
+        pos_enc = self.get_pos_enc(patches.shape[0], patch_idxs, input_size)
+        if self.training and self.pos_dropout > 0:
+            # Per-sample dropout: zero the whole encoding with probability pos_dropout.
+            keeps = (
+                torch.rand(
+                    patches.shape[0], 1, 1, dtype=pos_enc.dtype, device=pos_enc.device
+                )
+                > self.pos_dropout
+            )
+            pos_enc_drop = torch.where(keeps, pos_enc, 0)
+        else:
+            pos_enc_drop = pos_enc
+        return patches + pos_enc_drop, pos_enc
+
+    def get_pos_enc(  # positional encodings for a given input size, optionally gathered at patch_idxs
+        self,
+        batch_size: int,
+        patch_idxs: torch.Tensor | None = None,
+        input_size: tuple[int, int] | None = None,
+    ) -> torch.Tensor:
+        if input_size is None:
+            input_dims = self.input_dims  # already stored in patch units by __init__
+        else:
+            input_dims = tuple(d // self.patch_size for d in input_size)  # pixels -> patch grid
+
+        pos_embed = self._get_pos_embeddings(batch_size, input_dims)
+
+        if patch_idxs is None:
+            return pos_embed
+
+        exp_patch_idxs = patch_idxs.unsqueeze(-1).expand(-1, -1, pos_embed.shape[-1])  # repeat each index across the embedding dim
+
+        pos_embed = torch.gather(  # select the requested positions along the token axis (dim=1)
+            pos_embed.expand(patch_idxs.shape[0], -1, -1), dim=1, index=exp_patch_idxs
+        )
+        return pos_embed
+
+    def _get_pos_embeddings(self, batch_size: int, input_dims: tuple[int, int]):  # input_dims is (rows, cols) in patches
+        if (self.num_rows, self.num_cols) == input_dims:
+            return self.pos_embed  # fast path: requested grid equals the stored grid
+
+        pos_embed = self.pos_embed.reshape(1, self.num_rows, self.num_cols, -1).permute(  # (1, N, C) -> (1, C, rows, cols)
+            0, 3, 1, 2
+        )
+
+        def window_select(pos_embed):  # crop from the top-left corner down to input_dims; never enlarges
+            if input_dims[0] < pos_embed.shape[-2]:
+                pos_embed = pos_embed[..., : input_dims[0], :]
+            if input_dims[1] < pos_embed.shape[-1]:
+                pos_embed = pos_embed[..., :, : input_dims[1]]
+            return pos_embed
+
+        if self.cpe_mode:  # set in __init__ when max_input_dims != input_dims
+            if self.training:  # training: sample a random window (scale, aspect, offset) per batch element
+                min_scale = math.sqrt(0.1)
+                scale = (  # uniform in [sqrt(0.1), 1]
+                    torch.rand(batch_size, 1, 1, device=pos_embed.device)
+                    * (1 - min_scale)
+                    + min_scale
+                )
+                aspect_min = math.log(3 / 4)
+                aspect_max = -aspect_min
+                aspect = torch.exp(  # log-uniform aspect ratio in [3/4, 4/3]
+                    torch.rand(batch_size, 1, 1, device=pos_embed.device)
+                    * (aspect_max - aspect_min)
+                    + aspect_min
+                )
+
+                scale_x = scale * aspect
+                scale_y = scale * (1 / aspect)
+                scale_xy = torch.stack([scale_x, scale_y], dim=-1).clamp_(0, 1)  # window extent as a fraction of the grid
+
+                pos_xy = torch.rand(batch_size, 1, 1, 2, device=pos_embed.device) * (  # random offset keeping the window inside [0, 1]
+                    1 - scale_xy
+                )
+
+                lin_x = torch.linspace(
+                    0, 1, steps=input_dims[1], device=pos_embed.device
+                )[None, None].expand(batch_size, input_dims[0], -1)
+                lin_y = torch.linspace(
+                    0, 1, steps=input_dims[0], device=pos_embed.device
+                )[None, :, None].expand(batch_size, -1, input_dims[1])
+
+                lin_xy = torch.stack([lin_x, lin_y], dim=-1)  # last dim is (x, y), the order grid_sample expects
+
+                grid_xy = lin_xy * scale_xy + pos_xy
+
+                # Convert to [-1, 1] range
+                grid_xy.mul_(2).sub_(1)
+
+                pos_embed = F.grid_sample(  # sampled in float32, then cast back to the stored dtype
+                    pos_embed.float().expand(batch_size, -1, -1, -1),
+                    grid=grid_xy,
+                    mode="bilinear",
+                    padding_mode="zeros",
+                    align_corners=True,
+                ).to(pos_embed.dtype)
+            else:  # eval: resize to a square of side max(input_dims), then crop to input_dims
+                max_dim = max(input_dims)
+                pos_embed = F.interpolate(
+                    pos_embed.float(),
+                    size=(max_dim, max_dim),
+                    align_corners=True,
+                    mode="bilinear",
+                ).to(pos_embed.dtype)
+
+                pos_embed = window_select(pos_embed)
+        else:
+            pos_embed = window_select(pos_embed)
+
+        if pos_embed.shape[-2:] != input_dims:  # cropping cannot enlarge, so resize if the grid is still off
+            pos_embed = F.interpolate(
+                pos_embed.float(), size=input_dims, align_corners=True, mode="bilinear"
+            ).to(pos_embed.dtype)
+
+        pos_embed = pos_embed.flatten(2).permute(0, 2, 1)  # (B, C, rows, cols) -> (B, rows*cols, C)
+
+        return pos_embed
+
+
+class Im2Patches(nn.Module):
+    # Splits an image batch into a sequence of flattened square patches.
+    def __init__(self, patch_size: int):
+        super().__init__()
+        self.patch_size = patch_size
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        size = self.patch_size
+        if size == 1:
+            # Every pixel is its own patch: flatten the spatial dims, channels last.
+            return x.flatten(2).permute(0, 2, 1)
+
+        # Number of whole patches along each of the two trailing axes.
+        rows, cols = x.shape[-2] // size, x.shape[-1] // size
+        return rearrange(
+            x,
+            "b c (py yy) (px xx) -> b (py px) (c yy xx)",
+            py=rows,
+            yy=size,
+            px=cols,
+            xx=size,
+        )
+
+
+class ViTPatchLinear(nn.Linear):  # linear projection from one flattened patch to embed_dim
+    def __init__(self, patch_size: int, embed_dim: int, bias: bool = False, **factory):
+        super().__init__(3 * (patch_size**2), embed_dim, bias=bias, **factory)  # input width hard-codes 3 channels (presumably RGB)
+        self.patch_size = patch_size
+
+
+class RadioInternVisionModel(nn.Module):  # ViTPatchGenerator front-end followed by an InternVisionEncoder
+    packed_modules_mapping = {
+        "qkv": ["qkv"],
+    }
+
+    def __init__(
+        self,
+        config: PretrainedConfig = None,
+        quant_config: QuantizationConfig | None = None,
+    ) -> None:
+        super().__init__()
+
+        self.config = config  # despite the None default, config is dereferenced immediately below
+        self.img_size, self.grid_size, self.num_patches = self._init_img_size(
+            to_2tuple(config.patch_size), config.image_size
+        )
+        max_img_size = int(  # snap the configured maximum to the nearest multiple of patch_size
+            round(config.max_img_size / config.patch_size) * config.patch_size
+        )
+        self.patch_generator = ViTPatchGenerator(
+            config.patch_size,
+            config.hidden_size,
+            input_dims=self.img_size,
+            max_input_dims=max_img_size,
+            cls_token=True,
+            register_multiple=config.reg_tokens,
+        )
+
+        self.encoder = InternVisionEncoder(config=config, quant_config=quant_config)
+
+    def _init_img_size(self, patch_size, img_size: int | tuple[int, int]):  # -> (img_size, grid_size, num_patches)
+        if img_size is None:
+            return None, None, None
+        img_size = to_2tuple(img_size)
+        grid_size = tuple([s // p for s, p in zip(img_size, patch_size)])  # whole patches per axis
+        num_patches = grid_size[0] * grid_size[1]
+        return img_size, grid_size, num_patches
+
+    def get_input_embeddings(self):
+        return self.patch_generator  # was `self.embeddings`, which this class never defines (AttributeError)
+
+    def forward(self, x: torch.Tensor) -> torch.FloatTensor:  # returns the encoder's last hidden state
+        assert self.patch_generator is not None
+        hidden_states = self.patch_generator(x)
+        encoder_outputs = self.encoder.forward(inputs_embeds=hidden_states)
+        assert isinstance(encoder_outputs, BaseModelOutput)
+        return encoder_outputs.last_hidden_state
+
+
+class RadioModel(nn.Module):  # wraps RadioInternVisionModel and strips CLS/register tokens from its output
+    packed_modules_mapping = {
+        "qkv": ["qkv"],
+    }
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.model = RadioInternVisionModel(
+            config=config,
+            quant_config=quant_config,
+        )
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor | None = None,
+        pixel_embeds: torch.Tensor | None = None,  # accepted for interface compatibility; not used here
+    ) -> torch.FloatTensor:
+        y = self.model(pixel_values)
+        return self._extract_final(y)
+
+    def load_weights(self, weights) -> set[str]:  # returns the names of the parameters that were loaded
+        remap_substrings = {
+            "attn": "attn.attn",
+            "qkv": "qkv_proj",
+            "blocks": "encoder.layers",
+        }
+        remap_prefixes = {
+            "radio_model.": "",
+        }
+
+        loaded_params: set[str] = set()
+        params_dict = dict(self.named_parameters())
+
+        # Iterate lazily: copying a streamed checkpoint into a list first would keep
+        # every tensor alive at once for no benefit.
+        weights_iter = weights.items() if isinstance(weights, dict) else weights
+
+        # Accepts either a {name: tensor} mapping or an iterable of (name, tensor) pairs.
+        for name, weight in weights_iter:
+            if not name.startswith("radio_model."):
+                # Skip non-radio weights
+                continue
+            name = replace_substrings(name, remap_substrings)
+            name = replace_prefix(name, remap_prefixes)
+            if name and name in params_dict:  # checkpoint entries without a matching parameter are ignored
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, weight)
+                loaded_params.add(name)
+
+        return loaded_params
+
+    def _extract_final(self, y: torch.Tensor):
+        # Remove CLS + REGISTERS tokens
+        patch_gen = getattr(self.model, "patch_generator", None)
+        if patch_gen is None:
+            # Previously left `all_feat` unbound here; with no generator there is nothing to strip.
+            return y
+        return y[:, patch_gen.num_skip :]
diff --git a/benchmark/mmmu/internvl_utils.py b/python/sglang/srt/multimodal/internvl_utils.py
similarity index 78%
rename from benchmark/mmmu/internvl_utils.py
rename to python/sglang/srt/multimodal/internvl_utils.py
index 44c62c99aa41..0fbef1c7c048 100644
--- a/benchmark/mmmu/internvl_utils.py
+++ b/python/sglang/srt/multimodal/internvl_utils.py
@@ -8,14 +8,18 @@
 IMAGENET_STD = (0.229, 0.224, 0.225)
 
 
-def build_transform(input_size):
-    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+def build_transform(
+    input_size,
+    *,
+    mean: tuple[float, float, float],
+    std: tuple[float, float, float],
+):
     transform = T.Compose(
         [
             T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
             T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
             T.ToTensor(),
-            T.Normalize(mean=MEAN, std=STD),
+            T.Normalize(mean=mean, std=std),
         ]
     )
     return transform
@@ -38,8 +42,13 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_
 
 
 def dynamic_preprocess(
-    image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
-):
+    image: Image.Image,
+    *,
+    min_num: int,
+    max_num: int,
+    image_size: int,
+    use_thumbnail: bool,
+) -> list[Image.Image]:
     orig_width, orig_height = image.size
     aspect_ratio = orig_width / orig_height
 
@@ -83,12 +92,24 @@ def dynamic_preprocess(
     return processed_images
 
 
-def load_image(image_file, input_size=448, max_num=12):
-    image = Image.open(image_file).convert("RGB")
-    transform = build_transform(input_size=input_size)
+def image_to_pixel_values(
+    image: Image.Image,
+    *,
+    input_size: int,
+    min_num_tiles: int = 1,
+    max_num_tiles: int,
+    use_thumbnail: bool,
+    mean: tuple[float, float, float] = IMAGENET_MEAN,
+    std: tuple[float, float, float] = IMAGENET_STD,
+) -> torch.Tensor:
     images = dynamic_preprocess(
-        image, image_size=input_size, use_thumbnail=True, max_num=max_num
+        image,
+        min_num=min_num_tiles,
+        max_num=max_num_tiles,
+        image_size=input_size,
+        use_thumbnail=use_thumbnail,
     )
+    transform = build_transform(input_size, mean=mean, std=std)
     pixel_values = [transform(image) for image in images]
     pixel_values = torch.stack(pixel_values)
     return pixel_values
diff --git a/python/sglang/srt/multimodal/processors/nano_nemotron_vl.py b/python/sglang/srt/multimodal/processors/nano_nemotron_vl.py
new file mode 100644
index 000000000000..cb0ccad67b14
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/nano_nemotron_vl.py
@@ -0,0 +1,197 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import torch
+from PIL import Image
+
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.nano_nemotron_vl import NemotronH_Nano_VL_V2
+from sglang.srt.multimodal.internvl_utils import image_to_pixel_values
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+from sglang.srt.utils.common import sample_video_frames
+
+if TYPE_CHECKING:
+    from decord import VideoReader
+
+DEFAULT_NUM_TILES = 12  # default max_num_tiles for preprocess_image (still images)
+NUM_VIDEO_TILES = 1  # max_num_tiles used for every video frame
+DESIRED_FPS = 2  # TODO: allow desired fps/num frames to be configurable
+MAX_FRAMES = 128  # upper bound on frames sampled per video (passed to sample_video_frames)
+
+
+class NanoNemotronVLImageProcessor(BaseMultimodalProcessor):  # tiles images/video frames and expands their prompt placeholders
+    models = [NemotronH_Nano_VL_V2]
+
+    def __init__(self, hf_config, server_args, _image_processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _image_processor, *args, **kwargs)
+        Image.MAX_IMAGE_PIXELS = None  # NOTE(review): process-wide setting; turns off PIL's decompression-bomb size limit
+        self.image_size = hf_config.image_size
+        self.VIDEO_CONTEXT_TOKEN = hf_config.video_context_token
+        self.IMG_CONTEXT_TOKEN = hf_config.img_context_token
+        self.IMG_START_TOKEN = hf_config.img_start_token
+        self.IMG_END_TOKEN = hf_config.img_end_token
+        self.num_image_token = int(  # context tokens emitted per tile
+            (self.image_size // hf_config.patch_size) ** 2
+            * (hf_config.downsample_ratio**2)
+        )
+        if hasattr(self._processor, "tokenizer"):  # _processor is either a processor wrapping a tokenizer or the tokenizer itself
+            tokenizer = self._processor.tokenizer
+        else:
+            tokenizer = self._processor
+        self.tokenizer = tokenizer
+
+        self.img_start_token_id = tokenizer.convert_tokens_to_ids(self.IMG_START_TOKEN)
+        self.img_end_token_id = tokenizer.convert_tokens_to_ids(self.IMG_END_TOKEN)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self.IMG_CONTEXT_TOKEN,
+            image_token_id=tokenizer.convert_tokens_to_ids(self.IMG_CONTEXT_TOKEN),
+            video_token=self.VIDEO_CONTEXT_TOKEN,
+            video_token_id=tokenizer.convert_tokens_to_ids(self.VIDEO_CONTEXT_TOKEN),
+        ).build(_image_processor)
+
+        # Normalization config (mean/std) and tiling behavior
+        self.norm_mean = hf_config.norm_mean
+        self.norm_std = hf_config.norm_std
+        self.use_thumbnail = hf_config.use_thumbnail
+
+        self.PLACEHOLDER = self.tokenizer.unk_token  # temporary start marker for video frames; swapped for IMG_START after tokenizing
+        assert isinstance(self.PLACEHOLDER, str)
+        self.PLACEHOLDER_ID = tokenizer.convert_tokens_to_ids(self.PLACEHOLDER)
+        assert isinstance(self.PLACEHOLDER_ID, int)
+
+    def preprocess_image(  # tile + normalize one image; result is stacked per tile and cast to bfloat16
+        self, image: Image.Image, *, max_num_tiles: int = DEFAULT_NUM_TILES
+    ) -> torch.Tensor:
+        return image_to_pixel_values(
+            image,
+            input_size=self.image_size,
+            max_num_tiles=max_num_tiles,
+            use_thumbnail=self.use_thumbnail,
+            mean=self.norm_mean,
+            std=self.norm_std,
+        ).to(dtype=torch.bfloat16)
+
+    def render_image(self, *, num_tiles: int):  # prompt text for one image: start + (tokens per tile * tiles) + end
+        return f"{self.IMG_START_TOKEN}{self.IMG_CONTEXT_TOKEN * self.num_image_token * num_tiles}{self.IMG_END_TOKEN}"
+
+    def render_frame(  # prompt text for one video frame; always exactly one tile of context tokens
+        self, frame_index: int, *, timestamp: float, start_placeholder_token: str
+    ):
+        return f"Frame {frame_index + 1} sampled at {timestamp:.2f} seconds: {start_placeholder_token}{self.IMG_CONTEXT_TOKEN * self.num_image_token}{self.IMG_END_TOKEN}"
+
+    @staticmethod
+    def parse_video(video: "VideoReader") -> tuple[np.ndarray, list[float]]:  # -> (sampled frames, timestamps in seconds)
+        frames = sample_video_frames(  # frame indices, not pixel data
+            video, desired_fps=DESIRED_FPS, max_frames=MAX_FRAMES
+        )
+        video_array = video.get_batch(frames).asnumpy()
+        # doing the `1000 /` and then `/ 1000` is to match vllm's timestamping *exactly*, for reference.
+        frame_duration_ms = int(1000 / video.get_avg_fps())
+        timestamps = [i * frame_duration_ms / 1000.0 for i in frames]
+        return video_array, timestamps
+
+    async def process_mm_data_async(
+        self, image_data, input_text, request_obj, **kwargs
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            video_data=request_obj.video_data,
+            multimodal_tokens=self.mm_tokens,
+            discard_alpha_channel=True,
+        )
+
+        prompt = input_text  # placeholders are expanded on the raw input text, not on base_output
+
+        image_feature = None
+        if base_output.images:
+            preprocessed_images = [
+                self.preprocess_image(image) for image in base_output.images
+            ]
+            rendered_images = [
+                self.render_image(num_tiles=image.shape[0])  # dim 0 of each preprocessed image is its tile count
+                for image in preprocessed_images
+            ]
+            prompt = prompt.replace(self.IMG_CONTEXT_TOKEN, "".join(rendered_images), 1)  # NOTE(review): only the first placeholder is replaced, with ALL images joined — confirm multi-image prompts
+            image_feature = torch.cat(preprocessed_images, dim=0)
+
+        video_feature = None
+        if base_output.videos:
+            preprocessed_videos = []
+            for video in base_output.videos:
+                video_array, timestamps = self.parse_video(video)
+                frames_tensors = [
+                    self.preprocess_image(
+                        Image.fromarray(frame, mode="RGB"),
+                        max_num_tiles=NUM_VIDEO_TILES,
+                    )
+                    for frame in video_array
+                ]
+                preprocessed_video = torch.cat(frames_tensors, dim=0)
+                preprocessed_videos.append(preprocessed_video)
+                rendered_frames = [
+                    self.render_frame(
+                        i,
+                        timestamp=timestamp,
+                        start_placeholder_token=self.PLACEHOLDER,  # marks this run as a video frame for the offset split below
+                    )
+                    for i, timestamp in enumerate(timestamps)
+                ]
+                prompt = prompt.replace(  # one video placeholder consumed per video, in order
+                    self.VIDEO_CONTEXT_TOKEN, "".join(rendered_frames), 1
+                )
+            video_feature = torch.cat(preprocessed_videos, dim=0)
+
+        prompt_ids = self.tokenizer(
+            prompt, add_special_tokens=False, return_tensors="pt"
+        )["input_ids"].flatten()
+        offsets = self.get_mm_items_offset(prompt_ids, self.mm_tokens.image_token_id)  # image and video runs both use the image context token
+        img_offsets = [
+            (start, end)
+            for start, end in offsets
+            if prompt_ids[start - 1] == self.img_start_token_id  # NOTE(review): start == 0 would index prompt_ids[-1]
+        ]
+        video_offsets = [
+            (start, end)
+            for start, end in offsets
+            if prompt_ids[start - 1] == self.PLACEHOLDER_ID
+        ]
+        # Cleanup:
+        prompt_ids[prompt_ids == self.PLACEHOLDER_ID] = self.img_start_token_id  # NOTE(review): also rewrites any genuine unk tokens in the prompt
+
+        items = []
+        if image_feature is not None:
+            item = MultimodalDataItem(
+                Modality.IMAGE, feature=image_feature, offsets=img_offsets
+            )
+            items.append(item)
+        if video_feature is not None:
+            item = MultimodalDataItem(
+                Modality.VIDEO, feature=video_feature, offsets=video_offsets
+            )
+            items.append(item)
+
+        return {
+            "input_ids": prompt_ids.tolist(),
+            "mm_items": items,
+            "im_start_id": self.img_start_token_id,
+            "im_end_id": self.img_end_token_id,
+            "im_token_id": self.mm_tokens.image_token_id,
+            "video_token_id": self.mm_tokens.image_token_id,  # NOTE(review): image token id on purpose? render_frame emits IMG_CONTEXT_TOKEN — confirm
+        }
diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py
index 4d011106cd0e..3778ee39f0e9 100644
--- a/python/sglang/srt/utils/common.py
+++ b/python/sglang/srt/utils/common.py
@@ -27,6 +27,7 @@
 import itertools
 import json
 import logging
+import math
 import os
 import pickle
 import platform
@@ -96,6 +97,9 @@
 from sglang.srt.metrics.func_timer import enable_func_timer
 
 if TYPE_CHECKING:
+    # Imported only under TYPE_CHECKING (never at runtime): apparently a module-level decord import can segfault, see comment in load_video below
+    from decord import VideoReader
+
     from sglang.srt.server_args import ServerArgs
 
 logger = logging.getLogger(__name__)
@@ -994,6 +998,24 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
             os.unlink(tmp_file.name)
 
 
+def sample_video_frames(
+    video: "VideoReader", *, desired_fps: int, max_frames: int
+) -> list[int]:
+    # Pick evenly spaced frame indices at roughly desired_fps, capped at max_frames.
+    frame_total = len(video)
+    assert frame_total > 0, "Video must have at least one frame"
+
+    native_fps = video.get_avg_fps()
+    seconds = frame_total / native_fps
+    # Never sample faster than the video itself, never more than max_frames,
+    # and always keep at least one frame.
+    wanted = math.floor(seconds * min(desired_fps, native_fps))
+    wanted = max(1, min(max_frames, wanted, frame_total))
+    if wanted == frame_total:
+        return list(range(frame_total))
+    return np.linspace(0, frame_total - 1, wanted, dtype=int).tolist()
+
+
 def encode_video(video_path, frame_count_limit=None):
     # Lazy import because decord is not available on some arm platforms.
     from decord import VideoReader, cpu
diff --git a/python/sglang/srt/utils/hf_transformers_utils.py b/python/sglang/srt/utils/hf_transformers_utils.py
index aa1c46820a1e..0e71dfb31383 100644
--- a/python/sglang/srt/utils/hf_transformers_utils.py
+++ b/python/sglang/srt/utils/hf_transformers_utils.py
@@ -50,6 +50,7 @@
     KimiVLConfig,
     LongcatFlashConfig,
     MultiModalityConfig,
+    NemotronH_Nano_VL_V2_Config,
     NemotronHConfig,
     Olmo3Config,
     Qwen3NextConfig,
@@ -77,6 +78,7 @@
     FalconH1Config,
     DotsVLMConfig,
     DotsOCRConfig,
+    NemotronH_Nano_VL_V2_Config,
     NemotronHConfig,
     DeepseekVLV2Config,
     JetNemotronConfig,
@@ -144,6 +146,8 @@ def get_hf_text_config(config: PretrainedConfig):
             )
             return thinker_config.text_config
         return thinker_config
+    if hasattr(config, "llm_config"):
+        return config.llm_config
     else:
         return config
 
diff --git a/test/srt/models/test_nvidia_nemotron_nano_v2_vl.py b/test/srt/models/test_nvidia_nemotron_nano_v2_vl.py
new file mode 100644
index 000000000000..a3621841099d
--- /dev/null
+++ b/test/srt/models/test_nvidia_nemotron_nano_v2_vl.py
@@ -0,0 +1,31 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.test.gsm8k_mixin import GSM8KMixin
+from sglang.test.mmmu_vlm_mixin import MMMUVLMMixin
+from sglang.test.test_utils import CustomTestCase
+
+MODEL = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"
+
+
+class TestNvidiaNemotronNanoV2VLTextOnly(GSM8KMixin, CustomTestCase):  # text-only check; test methods come from GSM8KMixin
+    accuracy = 0.87  # presumably the accuracy threshold read by GSM8KMixin — confirm in the mixin
+    model = MODEL
+    other_args = ["--max-mamba-cache-size", "256", "--trust-remote-code"]  # presumably extra server launch flags — confirm in the mixin
+
+
+class TestNvidiaNemotronNanoV2VLMMMU(MMMUVLMMixin, CustomTestCase):  # vision check on an MMMU subset
+    accuracy = 0.454  # forwarded below as mmmu_accuracy
+    model = MODEL
+    other_args = ["--max-mamba-cache-size", "128", "--trust-remote-code"]  # presumably extra server launch flags — confirm in the mixin
+    mmmu_args = ["--limit=0.1"]
+    """`--limit=0.1`: 10 percent of each task - this is fine for testing since the nominal result isn't interesting - this run is just to prevent relative regressions."""
+
+    def test_vlm_mmmu_benchmark(self):
+        self._run_vlm_mmmu_test(  # helper presumably provided by MMMUVLMMixin
+            SimpleNamespace(model=self.model, mmmu_accuracy=self.accuracy), "./logs"
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index e885a659dd8b..2f4c9bf47d83 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -24,6 +24,7 @@
         TestFile("models/test_encoder_embedding_models.py", 460),
         TestFile("models/test_generation_models.py", 103),
         TestFile("models/test_nvidia_nemotron_nano_v2.py", 160),
+        TestFile("models/test_nvidia_nemotron_nano_v2_vl.py", 350),  # GSM8k + MMMU
         TestFile("models/test_qwen_models.py", 150),
         TestFile("models/test_reward_models.py", 132),
         TestFile("models/test_transformers_models.py", 320),
@@ -125,6 +126,7 @@
         TestFile("test_triton_moe_channel_fp8_kernel.py", 25),
         TestFile("test_triton_sliding_window.py", 100),
         TestFile("test_utils_update_weights.py", 48),
+        TestFile("test_video_utils.py", 5),
         TestFile("test_vision_chunked_prefill.py", 170),
         TestFile("test_vision_openai_server_a.py", 900),
         TestFile("test_vlm_input_format.py", 300),
diff --git a/test/srt/test_video_utils.py b/test/srt/test_video_utils.py
new file mode 100644
index 000000000000..af21a6fb8a84
--- /dev/null
+++ b/test/srt/test_video_utils.py
@@ -0,0 +1,59 @@
+from dataclasses import dataclass
+
+import pytest
+
+from sglang.srt.utils import sample_video_frames
+
+
+class DummyVideo:
+    # Minimal VideoReader stand-in: only len() and get_avg_fps() are provided.
+    def __init__(self, total_frames: int, avg_fps: float):
+        self._total_frames, self._avg_fps = total_frames, avg_fps
+
+    def __len__(self):
+        return self._total_frames
+
+    def get_avg_fps(self):
+        return self._avg_fps
+
+
+@dataclass(kw_only=True)
+class Case:  # one parametrized scenario for sample_video_frames
+    frames: int  # total frame count reported by the dummy video
+    avg_fps: float  # native fps reported by the dummy video
+    desired_fps: int
+    max_frames: int
+    expected_frames: list[int]  # exact indices sample_video_frames must return
+    description: str  # used as the pytest parametrize id
+
+
+# fmt: off
+@pytest.mark.parametrize("case", [
+    Case(
+        frames=100, avg_fps=25.0, desired_fps=5, max_frames=200,  # 4 s of video at 5 fps -> 20 frames
+        expected_frames=[0, 5, 10, 15, 20, 26, 31, 36, 41, 46, 52, 57, 62, 67, 72, 78, 83, 88, 93, 99],
+        description="capped by desired_fps"
+    ),
+    Case(
+        frames=10, avg_fps=10.0, desired_fps=100, max_frames=5,
+        expected_frames=[0, 2, 4, 6, 9],
+        description="capped by max_frames"
+    ),
+    Case(
+        frames=50, avg_fps=25.0, desired_fps=50, max_frames=200,  # desired_fps exceeds native fps -> every frame
+        expected_frames=list(range(50)),
+        description="capped by total_frames"
+    ),
+    Case(
+        frames=1, avg_fps=30.0, desired_fps=0, max_frames=0,  # zero limits still yield one frame
+        expected_frames=[0],
+        description="always sample at least 1 frame"
+    )
+],     ids=lambda c: c.description)
+def test_sample_video_frames_lengths(case: Case):  # compares the exact index list, not just its length
+    video = DummyVideo(case.frames, case.avg_fps)
+    result = sample_video_frames(video, desired_fps=case.desired_fps, max_frames=case.max_frames)
+    assert result == case.expected_frames
+
+if __name__ == "__main__":
+    pytest.main([__file__])