diff --git a/benchmark/mmmu/bench_hf.py b/benchmark/mmmu/bench_hf.py index 949b63b802a7..c841f44466d7 100644 --- a/benchmark/mmmu/bench_hf.py +++ b/benchmark/mmmu/bench_hf.py @@ -36,9 +36,10 @@ def eval_mmmu(args): try: # check if the model is belongs to internvl if "InternVL" in args.model_path: - from internvl_utils import load_image from transformers import AutoTokenizer + from sglang.srt.multimodal.internvl_utils import image_to_pixel_values + tokenizer = AutoTokenizer.from_pretrained(args.model_path) model = AutoModel.from_pretrained( args.model_path, @@ -80,7 +81,11 @@ def eval_mmmu(args): assert image is not None if "InternVL" in args.model_path: - pixel_values = load_image(sample["image_path"]).to(torch.bfloat16).cuda() + image = PIL.Image.open(sample["image_path"]).convert("RGB") + pixel_values = image_to_pixel_values( + image, input_size=448, max_num=12, use_thumbnail=True + ) + pixel_values = pixel_values.to(device="cuda", dtype=torch.bfloat16) contents = "" if prefix: contents += prefix diff --git a/docs/supported_models/multimodal_language_models.md b/docs/supported_models/multimodal_language_models.md index 3414d6c48d3a..90c877518de0 100644 --- a/docs/supported_models/multimodal_language_models.md +++ b/docs/supported_models/multimodal_language_models.md @@ -45,6 +45,7 @@ in the GitHub search bar. | **DotsVLM** (General/OCR) | `rednote-hilab/dots.vlm1.inst` | RedNote's vision-language model built on a 1.2B vision encoder and DeepSeek V3 LLM, featuring NaViT vision encoder trained from scratch with dynamic resolution support and enhanced OCR capabilities through structured image data training. | | | **DotsVLM-OCR** | `rednote-hilab/dots.ocr` | Specialized OCR variant of DotsVLM optimized for optical character recognition tasks with enhanced text extraction and document understanding capabilities. 
| Don't use `--trust-remote-code` | | **NVILA** (8B, 15B, Lite-2B, Lite-8B, Lite-15B) | `Efficient-Large-Model/NVILA-8B` | `chatml` | NVILA explores the full stack efficiency of multi-modal design, achieving cheaper training, faster deployment and better performance. | +| **NVIDIA Nemotron Nano 2.0 VL** | `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16` | NVIDIA Nemotron Nano v2 VL enables multi-image reasoning and video understanding, along with strong document intelligence, visual Q&A and summarization capabilities. It builds on Nemotron Nano V2, a hybrid Mamba-Transformer LLM, in order to achieve higher inference throughput in long document and video scenarios. | Use `--trust-remote-code`. You may need to adjust `--max-mamba-cache-size` [default is 512] to fit memory constraints. | | **JetVLM** | | JetVLM is an vision-language model designed for high-performance multimodal understanding and generation tasks built upon Jet-Nemotron. | Coming soon | ## Video Input Support @@ -57,6 +58,7 @@ SGLang supports video input for Vision-Language Models (VLMs), enabling temporal | **GLM-4v** (4.5V, 4.1V, MOE) | `zai-org/GLM-4.5V` | Video clips are read with Decord, converted to tensors, and passed to the model alongside metadata for rotary-position handling. | | **NVILA** (Full & Lite) | `Efficient-Large-Model/NVILA-8B` | The runtime samples eight frames per clip and attaches them to the multimodal request when `video_data` is present. | | **LLaVA video variants** (LLaVA-NeXT-Video, LLaVA-OneVision) | `lmms-lab/LLaVA-NeXT-Video-7B` | The processor routes video prompts to the LlavaVid video-enabled architecture, and the provided example shows how to query it with `sgl.video(...)` clips. | +| **NVIDIA Nemotron Nano 2.0 VL** | `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16` | For video, the processor is configured to sample at 2 FPS, at a max of 128 frames, as per model training. 
| | **JetVLM** | | The runtime samples eight frames per clip and attaches them to the multimodal request when `video_data` is present. | Use `sgl.video(path, num_frames)` when building prompts to attach clips from your SGLang programs. diff --git a/python/sglang/srt/configs/__init__.py b/python/sglang/srt/configs/__init__.py index 623d91d7352f..b35cc1dc5f23 100644 --- a/python/sglang/srt/configs/__init__.py +++ b/python/sglang/srt/configs/__init__.py @@ -12,6 +12,7 @@ from sglang.srt.configs.kimi_vl import KimiVLConfig from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig from sglang.srt.configs.longcat_flash import LongcatFlashConfig +from sglang.srt.configs.nano_nemotron_vl import NemotronH_Nano_VL_V2_Config from sglang.srt.configs.nemotron_h import NemotronHConfig from sglang.srt.configs.olmo3 import Olmo3Config from sglang.srt.configs.qwen3_next import Qwen3NextConfig @@ -40,6 +41,7 @@ "DotsOCRConfig", "FalconH1Config", "NemotronHConfig", + "NemotronH_Nano_VL_V2_Config", "JetNemotronConfig", "JetVLMConfig", ] diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 446533f53fb3..7f1f3e472c6f 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -938,6 +938,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal "Mistral3ForConditionalGeneration", "MultiModalityCausalLM", "MllamaForConditionalGeneration", + "NemotronH_Nano_VL_V2", "Qwen2AudioForConditionalGeneration", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration", diff --git a/python/sglang/srt/configs/nano_nemotron_vl.py b/python/sglang/srt/configs/nano_nemotron_vl.py new file mode 100644 index 000000000000..09ab29abf465 --- /dev/null +++ b/python/sglang/srt/configs/nano_nemotron_vl.py @@ -0,0 +1,114 @@ +# Copyright 2025 SGLang Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance 
def float_triplet(seq: Any) -> tuple[float, float, float]:
    """Validate that *seq* holds exactly three floats and return them.

    Args:
        seq: Any iterable; it is materialised with ``tuple(seq)``.

    Returns:
        The three elements of *seq* as an ``(a, b, c)`` tuple.

    Raises:
        TypeError: If *seq* is not iterable (raised by ``tuple(seq)``).
        ValueError: If *seq* does not contain exactly three elements
            (raised by the tuple unpacking).
        AssertionError: If any element is not a ``float`` instance.
    """
    a, b, c = tuple(seq)
    # Raise explicitly instead of using an ``assert`` statement: asserts are
    # stripped under ``python -O``, which would silently disable this check.
    # AssertionError (with the same message) is kept so the failure type
    # seen by existing callers is unchanged.
    if not all(isinstance(value, float) for value in (a, b, c)):
        raise AssertionError("expected three floats")
    return a, b, c